#########################################################
## PROGRAM NAME: 004_munis_trts.R                      ##
## AUTHOR: MATT MLECZKO                                ##
## INPUTS:                                             ##
##        001_msa_del.Rda                              ##
##        002_tract_to_cs.Rda                          ##
##        002_tract_to_pl.Rda                          ##
##        003_msa_2009.Rda                             ##
##        003_msa_2022.Rda                             ##
##        places_popdensity_2009.csv                   ##
##        cousubs_popdensity_2009.csv                  ##
##        2022tractcrosswalk.csv                       ##
##        trtxwalk20102020.txt                         ##
##        pls2009.csv                                  ##
##                                                     ##
## OUTPUTS:                                            ##
##          004_allmunis_2009.Rda                      ##
##          004_allmunis_2022.Rda                      ##
##          004_hpov_muni_2009.Rda                     ##
##          004_hpov_muni_2022.Rda                     ##
##          004_hpov_tracts_2009.Rda                   ##
##          004_hpov_tracts_2022.Rda                   ##
##          004_hpov_tracts_alt_2009.Rda               ##
##          004_hpov_tracts_alt_2022.Rda               ##
##          004_hpov_tracts_mm_2009.Rda                ##
##          004_hpov_tracts_mm_2022.Rda                ##
##          004_msa_muni_geoids.Rda                    ##
##          004_npov_muni_2009.Rda                     ##
##          004_npov_muni_2022.Rda                     ##
##          004_npov_tracts_2009.Rda                   ##
##          004_npov_tracts_2022.Rda                   ##
##          004_npov_tracts_alt_2009.Rda               ##
##          004_npov_tracts_alt_2022.Rda               ##
##          004_npov_tracts_mm_2009.Rda                ##
##          004_npov_tracts_mm_2022.Rda                ##
##          004_trt2000_basevars.Rda                   ##
##          004_trt2012_basevars.Rda                   ##
##                                                     ## 
## PURPOSE: Download/process muni and tract            ##
##          level ACS data (2005-2009, 2018-2022);     ##
##          create and aggregate tract-level outcomes; ##
##          create muni-level file                     ##
##                                                     ##
## LIST OF UPDATES:                                    ##
#########################################################

#log <- file(# USER DEFINED PATH AND FILE NAME HERE #)
#sink(log, append=TRUE)
#sink(log, append=TRUE, type="message")

## load libraries ##

library(tidycensus)
library(tigris)
library(tidyr)
library(dplyr)
library(foreign)
library(stringr)
library(tm)
library(gdata)
library(gsubfn)
library(haven)
library(readxl)
library(data.table)

## define paths ##
input_path <- # USER DEFINED PATH HERE #
output_path <- # USER DEFINED PATH HERE #

## set working directory ##
setwd(input_path)

`%notin%` <- Negate(`%in%`)

## create a merge function that creates merge frequency as in Stata ##
## from user rwbuie at stackoverflow: https://stackoverflow.com/questions/30358401/is-there-a-way-to-create-statas-merge-indicator-variable-with-rs-merge ##
stata.merge <- function(x,y, name){
  x$df1 <- 1
  y$df2 <- 2
  df <- merge(x,y, by = name, all = TRUE)
  df$merge.variable <- rowSums(df[,c("df1", "df2")], na.rm=TRUE)
  df$df1 <- NULL
  df$df2<- NULL
  df
  #print(table(df$merge.variable))
  
  ## return the merged dataframe
  return(df)
}


## input Census key ##

#census_api_key(# USER CENSUS API KEY HERE #)

## states to loop through ##
states <- c("AL","AK","AZ","AR","CA","CO","CT","DE",
            "DC","FL","GA","HI","ID","IL","IN","IA","KS",
            "KY","LA","ME","MD","MA","MI","MN","MS","MO",
            "MT","NE","NV","NH","NJ","NM","NY","NC","ND",
            "OH","OK","OR","PA","RI","SC","SD","TN","TX",
            "UT","VT","VA","WA","WV","WI","WY")

load(paste0(output_path, "001_msa_del.Rda"))
load(paste0(output_path, "002_tract_to_cs.Rda"))
load(paste0(output_path, "002_tract_to_pl.Rda"))
load(paste0(output_path, "003_msa_2009.Rda"))
load(paste0(output_path, "003_msa_2022.Rda"))


## read-in 2009 pop density info ##

places.popdensity.2009.in <- read.csv("places_popdensity_2009.csv",
                                      header = T,
                                      stringsAsFactors = F)

cousubs.popdensity.2009.in <- read.csv("cousubs_popdensity_2009.csv",
                                      header = T,
                                      stringsAsFactors = F)

## clean the 2009 pop density info ## 

## places ##
places.popdensity.2009 <- places.popdensity.2009.in %>%
  select(Geo_FIPS,
         Geo_GEOID,
         Geo_NAME,
         Geo_STATE,
         Geo_COUNTY,
         Geo_COUSUB,
         Geo_PLACE,
         Geo_COUSUB,
         SE_A00002_001,
         SE_A00002_002,
         SE_A00002_003) %>%
  rename(total_pop = SE_A00002_001,
         area = SE_A00002_002,
         pop_density = SE_A00002_003) %>%
  filter(total_pop > 0 & Geo_STATE != "72")

places.popdensity.2009$Geo_STATE <- str_pad(places.popdensity.2009 $Geo_STATE, 2, pad = "0")
places.popdensity.2009$Geo_PLACE <- str_pad(places.popdensity.2009$Geo_PLACE, 5, pad = "0")
places.popdensity.2009$GEOID <- paste(places.popdensity.2009$Geo_STATE,
                                      places.popdensity.2009$Geo_PLACE,
                                      sep="")

## cosubs ##
cousubs.popdensity.2009 <- cousubs.popdensity.2009.in %>%
  select(Geo_FIPS,
         Geo_GEOID,
         Geo_NAME,
         Geo_COUNTY,
         Geo_STATE,
         Geo_COUSUB,
         Geo_PLACE,
         SE_A00002_001,
         SE_A00002_002,
         SE_A00002_003) %>%
  rename(total_pop = SE_A00002_001,
         area = SE_A00002_002,
         pop_density = SE_A00002_003) %>%
  filter(total_pop > 0 & Geo_STATE != "72" & !grepl("CCD|precinct", Geo_NAME)) %>%
  mutate(Geo_STATE = str_pad(Geo_STATE, 2, pad = "0"),
         Geo_COUNTY = str_pad(Geo_COUNTY, 3, pad = "0"),
         Geo_COUSUB = str_pad(Geo_COUSUB, 5, pad = "0"),
         GEOID_full = str_pad(Geo_FIPS, 10, pad = "0"))

load(paste0(output_path,
            "ltdb0010.Rda"))

ltdb.final$GEOID <- ltdb.final$trtid00

ltdb.fm <- select(ltdb.final,
                  trtid00,
                  trtid10,
                  weight,
                  GEOID)


## more variables for percentage growth vars ## 

## now, bring on 2000 median rent and median property values ## 

v2000 <- load_variables(2000, "sf3")

state.counter <- 1 
state.tracts.2000 <- list()

## start the loop ##
for (st in states){
  
  tr2000 <- get_decennial(geography = "tract", 
                          variables = c(mgr_2000 = "H063001",
                                        mpv_2000 = "H085001"),
                          state = st, 
                          survey = "sf3",
                          output = "wide",
                          year = 2000)
  
  tr2000$state <- st
  
  ## store the data frame in the list ## 
  state.tracts.2000[[state.counter]] <- tr2000
  
  ## increase interval by 1 ## 
  state.counter <- state.counter + 1
  
}


## combine all the data ## 
all.tracts.2000.cb <- bind_rows(state.tracts.2000)

## convert tracts to 2010 boundaries ##
## pre-merge checks ##

class(all.tracts.2000.cb$GEOID)
range(nchar(trim(all.tracts.2000.cb$GEOID)))
nrow(all.tracts.2000.cb) == length(unique(all.tracts.2000.cb$GEOID))

class(ltdb.fm$GEOID)
range(nchar(trim(ltdb.fm$GEOID)))

## merge the files ## 

all.tracts.2000.updated <- stata.merge(all.tracts.2000.cb,
                                       ltdb.fm,
                                       "GEOID")

## diagnose merge ## 
table(all.tracts.2000.updated$merge.variable)

## keep only the matches and obs with non-zero weights ##

all.tracts.2000.match <- all.tracts.2000.updated %>%
  filter(merge.variable == 3 &
         weight != 0) %>%
  select(-merge.variable,
         -ends_with("M")) %>%
  rename_with(~str_remove(., 'E')) %>%
  rename(GEOID = GOID)

summary(all.tracts.2000.match)

## change the GEOID ##
## some 2010 tract IDs are duplicated because of  ##
## tracts that changed from 2000 to 2010; these   ##
## tracts will be condensed into one observation  ##
## via a weighted average, with weights from LTDB ##

all.tracts.2000.pr <- all.tracts.2000.match %>%
  select(GEOID,
         trtid00,
         trtid10, 
         state,
         mgr_2000,
         mpv_2000,
         weight) %>%
  group_by(trtid10) %>%
  summarize_at(vars(mgr_2000:mpv_2000),
               funs(weighted.mean(., weight, na.rm=T))) %>%
  ungroup() %>%
  rename(GEOID = trtid10) %>%
  ## inflation adjustment ## 
  mutate(mgr_2000 = mgr_2000 * 437.6/246.8,
         mpv_2000 = mpv_2000 * 437.6/246.8)

summary(all.tracts.2000.pr)


## correct GEOIDs according to Census guidance ## 
## https://www2.census.gov/geo/pdfs/reference/Geography_Notes.pdf ##

## Pima County ## 
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "04019002701"] <- "04019002704"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "04019002903"] <- "04019002906"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "04019410501"] <- "04019004118"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "04019410502"] <- "04019004121"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "04019410503"] <- "04019004125"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "04019470400"] <- "04019005200"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "04019470500"] <- "04019005300"

## LA County ##
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "06037930401"] <- "06037137000"

## NY ## 
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "36053940101"] <- "36053030101"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "36053940102"] <- "36053030102"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "36053940103"] <- "36053030103"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "36053940200"] <- "36053030200"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "36053940300"] <- "36053030300"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "36053940401"] <- "36053030401"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "36053940403"] <- "36053030403"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "36053940600"] <- "36053030600"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "36053940700"] <- "36053030402"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "36065940000"] <- "36065024800"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "36065940100"] <- "36065024700"
all.tracts.2000.pr$GEOID[all.tracts.2000.pr$GEOID == "36065940200"] <- "36065024900"

## now, bring on 2008-2012 median rent and median property values ## 

v2012 <- load_variables(2012, "acs5")

state.counter <- 1 
state.tracts.2012 <- list()

## start the loop ##
for (st in states){
  
  tr2012 <- get_acs(geography = "tract", 
                    variables = c(mgr_2012 = "B25064_001",
                                  mpv_2012 = "B25077_001"),
                    state = st, 
                    survey = "acs5",
                    output = "wide",
                    year = 2012)
  
  tr2012$state <- st
  
  ## store the data frame in the list ## 
  state.tracts.2012[[state.counter]] <- tr2012
  
  ## increase interval by 1 ## 
  state.counter <- state.counter + 1
  
}


## combine all the data ## 
all.tracts.2012.cb <- bind_rows(state.tracts.2012) 

all.tracts.2012 <- all.tracts.2012.cb %>%
  select(GEOID,
         ends_with("E")) %>%
  rename(mgr_2012 = mgr_2012E,
         mpv_2012 = mpv_2012E) %>%
  ## inflation adjustment ## 
  mutate(mgr_2012 = mgr_2012 * 437.6/337,
         mpv_2012 = mpv_2012 * 437.6/337) %>%
  select(-NAME,
         -state)

summary(all.tracts.2012)

## correct GEOIDs according to Census guidance ## 
## https://www2.census.gov/geo/pdfs/reference/Geography_Notes.pdf ##

## Pima County ## 
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "04019002701"] <- "04019002704"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "04019002903"] <- "04019002906"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "04019410501"] <- "04019004118"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "04019410502"] <- "04019004121"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "04019410503"] <- "04019004125"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "04019470400"] <- "04019005200"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "04019470500"] <- "04019005300"

## LA County ##
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "06037930401"] <- "06037137000"

## NY ## 
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "36053940101"] <- "36053030101"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "36053940102"] <- "36053030102"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "36053940103"] <- "36053030103"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "36053940200"] <- "36053030200"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "36053940300"] <- "36053030300"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "36053940401"] <- "36053030401"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "36053940403"] <- "36053030403"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "36053940600"] <- "36053030600"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "36053940700"] <- "36053030402"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "36065940000"] <- "36065024800"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "36065940100"] <- "36065024700"
all.tracts.2012$GEOID[all.tracts.2012$GEOID == "36065940200"] <- "36065024900"


save(all.tracts.2000.pr, 
     file = paste0(output_path,
                   "004_trt2000_basevars.Rda"))

save(all.tracts.2012, 
     file = paste0(output_path,
                   "004_trt2012_basevars.Rda"))


#########################################
## 2018-2022 tracts and municipalities ##
#########################################

v2022 <- load_variables(2022, "acs5", cache=T)

## initialize lists to store data frames ##
state.tracts.2022 <- list()
state.places.2022 <- list() 
state.cosubs.2022 <- list()
state.places.2022.area <- list()
state.cosubs.2022.area <- list()

## initialize counter ## 
state.counter <- 1

## start the loop ##
for (st in states){
  
  tr2022 <- get_acs(geography = "tract", 
                    variables = c(pop_total = "B01003_001",
                                  ro_hhlds = "B25003_003",
                                  rb_den = "B25070_001",
                                  rb_num1 = "B25070_007",
                                  rb_num2 = "B25070_008",
                                  rb_num3 = "B25070_009",
                                  rb_num4 = "B25070_010",
                                  median_gross_rent = "B25064_001",
                                  median_prop_value = "B25077_001",
                                  hhld_pov_den = "B17017_001",
                                  hhld_pov_num = "B17017_002",
                                  pop_nh_white = "B03002_003",
                                  pop_nh_black = "B03002_004",
                                  pop_nh_aian = "B03002_005",
                                  pop_nh_asian = "B03002_006",
                                  pop_nh_nhpi = "B03002_007",
                                  pop_nh_other = "B03002_008",
                                  pop_nh_multi = "B03002_009",
                                  pop_nh_multi_oth = "B03002_010",
                                  pop_nh_multi_noth ="B03002_011",
                                  pop_h = "B03002_012",
                                  gq_pop = "B26001_001"),
                    state = st, 
                    survey = "acs5",
                    output = "wide",
                    year = 2022)
  
  tr2022$state <- st
  
  ## store the data frame in the list ## 
  state.tracts.2022[[state.counter]] <- tr2022
  
  ## get data for places ##
  pl2022 <- get_acs(geography = "place", 
                     variables = c(pop_total = "B01003_001",
                                   race_den = "B02001_001",
                                   pop_white = "B02001_002",
                                   pop_black = "B02001_003",
                                   pop_aian = "B02001_004",
                                   pop_asian = "B02001_005",
                                   pop_nhpi = "B02001_006",
                                   pop_other = "B02001_007",
                                   pop_multi = "B02001_008",
                                   pop_multi_oth = "B02001_009",
                                   pop_multi_noth = "B02001_010",
                                   hisp_race_den = "B03002_001",
                                   pop_nh = "B03002_002",
                                   pop_nh_white = "B03002_003",
                                   pop_nh_black = "B03002_004",
                                   pop_nh_aian = "B03002_005",
                                   pop_nh_asian = "B03002_006",
                                   pop_nh_nhpi = "B03002_007",
                                   pop_nh_other = "B03002_008",
                                   pop_nh_multi = "B03002_009",
                                   pop_nh_multi_oth = "B03002_010",
                                   pop_nh_multi_noth ="B03002_011",
                                   pop_h = "B03002_012",
                                   pop_h_white = "B03002_013",
                                   pop_h_black = "B03002_014",
                                   pop_h_aian = "B03002_015",
                                   pop_h_asian = "B03002_016",
                                   pop_h_nhpi = "B03002_017",
                                   pop_h_other = "B03002_018",
                                   pop_h_multi = "B03002_019",
                                   pop_h_multi_oth = "B03002_020",
                                   pop_h_multi_noth = "B03002_021",
                                   age_den = "B01001_001",
                                   tot_male = "B01001_002",
                                   male_5u = "B01001_003",
                                   male_5t9 = "B01001_004",
                                   male_10t14 = "B01001_005",
                                   male_15t17 = "B01001_006",
                                   male_18t19 = "B01001_007",
                                   male_20 = "B01001_008",
                                   male_21 = "B01001_009",
                                   male_22t24 = "B01001_010",
                                   male_25t29 = "B01001_011",
                                   male_30t34 = "B01001_012",
                                   male_35t39 = "B01001_013",
                                   male_40t44 = "B01001_014",
                                   male_45t49 = "B01001_015",
                                   male_50t54 = "B01001_016",
                                   male_55t59 = "B01001_017",
                                   male_60t61 = "B01001_018",
                                   male_62t64 = "B01001_019",
                                   male_65t66 = "B01001_020",
                                   male_67t69 = "B01001_021",
                                   male_70t74 = "B01001_022",
                                   male_75t79 = "B01001_023",
                                   male_80t84 = "B01001_024",
                                   male_85over = "B01001_025",
                                   tot_female = "B01001_026",
                                   female_5u = "B01001_027",
                                   female_5t9 = "B01001_028",
                                   female_10t14 = "B01001_029",
                                   female_15t17 = "B01001_030",
                                   female_18t19 = "B01001_031",
                                   female_20 = "B01001_032",
                                   female_21 = "B01001_033",
                                   female_22t24 = "B01001_034",
                                   female_25t29 = "B01001_035",
                                   female_30t34 = "B01001_036",
                                   female_35t39 = "B01001_037",
                                   female_40t44 = "B01001_038",
                                   female_45t49 = "B01001_039",
                                   female_50t54 = "B01001_040",
                                   female_55t59 = "B01001_041",
                                   female_60t61 = "B01001_042",
                                   female_62t64 = "B01001_043",
                                   female_65t66 = "B01001_044",
                                   female_67t69 = "B01001_045",
                                   female_70t74 = "B01001_046",
                                   female_75t79 = "B01001_047",
                                   female_80t84 = "B01001_048",
                                   female_85over = "B01001_049",
                                   median_age = "B01002_001",
                                   hhlds_child_den = "B11005_001",
                                   hhlds_child_num = "B11005_002",
                                   hhld_move_den = "B07001_001",
                                   hhld_nomove = "B07001_017",
                                   foreign_den = "B05002_001",
                                   foreign_born = "B05002_013",
                                   hhs = "B25003_001",
                                   oo_hu = "B25003_002",
                                   ro_hu = "B25003_003",
                                   rb_den = "B25070_001",
                                   rb_num1 = "B25070_007",
                                   rb_num2 = "B25070_008",
                                   rb_num3 = "B25070_009",
                                   rb_num4 = "B25070_010",
                                   median_gross_rent = "B25064_001",
                                   median_prop_value = "B25077_001",
                                   median_yr_str = "B25035_001",
                                   total_hu = "B25002_001",
                                   total_vacant = "B25002_003",
                                   median_hhld_inc = "B19013_001",
                                   hhld_inc = "B19001_001",
                                   hhld_inc1 = "B19001_002",
                                   hhld_inc2 = "B19001_003",
                                   hhld_inc3 = "B19001_004",
                                   hhld_inc4 = "B19001_005",
                                   hhld_inc5 = "B19001_006",
                                   hhld_inc6 = "B19001_007",
                                   hhld_inc7 = "B19001_008",
                                   hhld_inc8 = "B19001_009",
                                   hhld_inc9 = "B19001_010",
                                   hhld_inc10 = "B19001_011",
                                   hhld_inc11 = "B19001_012",
                                   hhld_inc12 = "B19001_013",
                                   hhld_inc13 = "B19001_014",
                                   hhld_inc14 = "B19001_015",
                                   hhld_inc15 = "B19001_016",
                                   hhld_inc16 = "B19001_017",
                                   hhld_pov_den = "B17017_001",
                                   hhld_pov_num = "B17017_002",
                                   ed_den = "B15003_001",
                                   ed_none = "B15003_002",
                                   ed_nurse = "B15003_003",
                                   ed_k = "B15003_004",
                                   ed_1 = "B15003_005",
                                   ed_2 = "B15003_006",
                                   ed_3 = "B15003_007",
                                   ed_4 = "B15003_008",
                                   ed_5 = "B15003_009",
                                   ed_6 = "B15003_010",
                                   ed_7 = "B15003_011",
                                   ed_8 = "B15003_012",
                                   ed_9 = "B15003_013",
                                   ed_10 = "B15003_014",
                                   ed_11 = "B15003_015",
                                   ed_12_nd = "B15003_016",
                                   ed_12_d = "B15003_017",
                                   ed_ged = "B15003_018",
                                   ed_sc1l = "B15003_019",
                                   ed_sc1m = "B15003_020",
                                   ed_as = "B15003_021",
                                   ed_ba = "B15003_022",
                                   ed_ma = "B15003_023",
                                   ed_pd = "B15003_024",
                                   ed_phd = "B15003_025",
                                   gini = "B19083_001",
                                   mlf_1619 = "B23001_006",
                                   mup_1619 = "B23001_008",
                                   mlf_2021 = "B23001_013",
                                   mup_2021 = "B23001_015",
                                   mlf_2224 = "B23001_020",
                                   mup_2224 = "B23001_022",
                                   mlf_2529 = "B23001_027",
                                   mup_2529 = "B23001_029",
                                   mlf_3034 = "B23001_034",
                                   mup_3034 = "B23001_036",
                                   mlf_3544 = "B23001_041",
                                   mup_3544 = "B23001_043",
                                   mlf_4554 = "B23001_048",
                                   mup_4554 = "B23001_050",
                                   mlf_5559 = "B23001_055",
                                   mup_5559 = "B23001_057",
                                   mlf_6061 = "B23001_062",
                                   mup_6061 = "B23001_064",
                                   mlf_6264 = "B23001_069",
                                   mup_6264 = "B23001_071",
                                   mlf_6569 = "B23001_074",
                                   mup_6569 = "B23001_076",
                                   mlf_7074 = "B23001_079",
                                   mup_7074 = "B23001_081",
                                   mlf_75u = "B23001_084",
                                   mup_75u = "B23001_086",
                                   flp_1619 = "B23001_092",
                                   fup_1619 = "B23001_094",
                                   flp_2021 = "B23001_099",
                                   fup_2021 = "B23001_101",
                                   flp_2224 = "B23001_106",
                                   fup_2224 = "B23001_108",
                                   flp_2529 = "B23001_113",
                                   fup_2529 = "B23001_115",
                                   flp_3034 = "B23001_120",
                                   fup_3034 = "B23001_122",
                                   flp_3544 = "B23001_127",
                                   fup_3544 = "B23001_129",
                                   flp_4554 = "B23001_134",
                                   fup_4554 = "B23001_136",
                                   flp_5559 = "B23001_141",
                                   fup_5559 = "B23001_143",
                                   flp_6061 = "B23001_148",
                                   fup_6061 = "B23001_150",
                                   flp_6264 = "B23001_155",
                                   fup_6264 = "B23001_157",
                                   flp_6569 = "B23001_160",
                                   fup_6569 = "B23001_162",
                                   flp_7074 = "B23001_165",
                                   fup_7074 = "B23001_167",
                                   flp_75u = "B23001_170",
                                   fup_75u = "B23001_172"),
                     state = st, 
                     survey = "acs5",
                     output = "wide",
                     year = 2022)
  
  ## get land area info ##
  pl2022.area <- as.data.frame(places(state=st,
                                      year = 2022,
                                      cb=TRUE))
  
  
  ## add state identifying variable ##
  pl2022$state <- st
  pl2022.area$state <- st
  
  state.places.2022[[state.counter]] <- pl2022
  state.places.2022.area[[state.counter]] <- pl2022.area
  
  cosub2022 <- get_acs(geography = "county subdivision", 
                       variables = c(pop_total = "B01003_001",
                                     race_den = "B02001_001",
                                     pop_white = "B02001_002",
                                     pop_black = "B02001_003",
                                     pop_aian = "B02001_004",
                                     pop_asian = "B02001_005",
                                     pop_nhpi = "B02001_006",
                                     pop_other = "B02001_007",
                                     pop_multi = "B02001_008",
                                     pop_multi_oth = "B02001_009",
                                     pop_multi_noth = "B02001_010",
                                     hisp_race_den = "B03002_001",
                                     pop_nh = "B03002_002",
                                     pop_nh_white = "B03002_003",
                                     pop_nh_black = "B03002_004",
                                     pop_nh_aian = "B03002_005",
                                     pop_nh_asian = "B03002_006",
                                     pop_nh_nhpi = "B03002_007",
                                     pop_nh_other = "B03002_008",
                                     pop_nh_multi = "B03002_009",
                                     pop_nh_multi_oth = "B03002_010",
                                     pop_nh_multi_noth ="B03002_011",
                                     pop_h = "B03002_012",
                                     pop_h_white = "B03002_013",
                                     pop_h_black = "B03002_014",
                                     pop_h_aian = "B03002_015",
                                     pop_h_asian = "B03002_016",
                                     pop_h_nhpi = "B03002_017",
                                     pop_h_other = "B03002_018",
                                     pop_h_multi = "B03002_019",
                                     pop_h_multi_oth = "B03002_020",
                                     pop_h_multi_noth = "B03002_021",
                                     age_den = "B01001_001",
                                     tot_male = "B01001_002",
                                     male_5u = "B01001_003",
                                     male_5t9 = "B01001_004",
                                     male_10t14 = "B01001_005",
                                     male_15t17 = "B01001_006",
                                     male_18t19 = "B01001_007",
                                     male_20 = "B01001_008",
                                     male_21 = "B01001_009",
                                     male_22t24 = "B01001_010",
                                     male_25t29 = "B01001_011",
                                     male_30t34 = "B01001_012",
                                     male_35t39 = "B01001_013",
                                     male_40t44 = "B01001_014",
                                     male_45t49 = "B01001_015",
                                     male_50t54 = "B01001_016",
                                     male_55t59 = "B01001_017",
                                     male_60t61 = "B01001_018",
                                     male_62t64 = "B01001_019",
                                     male_65t66 = "B01001_020",
                                     male_67t69 = "B01001_021",
                                     male_70t74 = "B01001_022",
                                     male_75t79 = "B01001_023",
                                     male_80t84 = "B01001_024",
                                     male_85over = "B01001_025",
                                     tot_female = "B01001_026",
                                     female_5u = "B01001_027",
                                     female_5t9 = "B01001_028",
                                     female_10t14 = "B01001_029",
                                     female_15t17 = "B01001_030",
                                     female_18t19 = "B01001_031",
                                     female_20 = "B01001_032",
                                     female_21 = "B01001_033",
                                     female_22t24 = "B01001_034",
                                     female_25t29 = "B01001_035",
                                     female_30t34 = "B01001_036",
                                     female_35t39 = "B01001_037",
                                     female_40t44 = "B01001_038",
                                     female_45t49 = "B01001_039",
                                     female_50t54 = "B01001_040",
                                     female_55t59 = "B01001_041",
                                     female_60t61 = "B01001_042",
                                     female_62t64 = "B01001_043",
                                     female_65t66 = "B01001_044",
                                     female_67t69 = "B01001_045",
                                     female_70t74 = "B01001_046",
                                     female_75t79 = "B01001_047",
                                     female_80t84 = "B01001_048",
                                     female_85over = "B01001_049",
                                     median_age = "B01002_001",
                                     hhlds_child_den = "B11005_001",
                                     hhlds_child_num = "B11005_002",
                                     hhld_move_den = "B07001_001",
                                     hhld_nomove = "B07001_017",
                                     foreign_den = "B05002_001",
                                     foreign_born = "B05002_013",
                                     hhs = "B25003_001",
                                     oo_hu = "B25003_002",
                                     ro_hu = "B25003_003",
                                     rb_den = "B25070_001",
                                     rb_num1 = "B25070_007",
                                     rb_num2 = "B25070_008",
                                     rb_num3 = "B25070_009",
                                     rb_num4 = "B25070_010",
                                     median_gross_rent = "B25064_001",
                                     median_prop_value = "B25077_001",
                                     median_yr_str = "B25035_001",
                                     total_hu = "B25002_001",
                                     total_vacant = "B25002_003",
                                     median_hhld_inc = "B19013_001",
                                     hhld_inc = "B19001_001",
                                     hhld_inc1 = "B19001_002",
                                     hhld_inc2 = "B19001_003",
                                     hhld_inc3 = "B19001_004",
                                     hhld_inc4 = "B19001_005",
                                     hhld_inc5 = "B19001_006",
                                     hhld_inc6 = "B19001_007",
                                     hhld_inc7 = "B19001_008",
                                     hhld_inc8 = "B19001_009",
                                     hhld_inc9 = "B19001_010",
                                     hhld_inc10 = "B19001_011",
                                     hhld_inc11 = "B19001_012",
                                     hhld_inc12 = "B19001_013",
                                     hhld_inc13 = "B19001_014",
                                     hhld_inc14 = "B19001_015",
                                     hhld_inc15 = "B19001_016",
                                     hhld_inc16 = "B19001_017",
                                     hhld_pov_den = "B17017_001",
                                     hhld_pov_num = "B17017_002",
                                     ed_den = "B15003_001",
                                     ed_none = "B15003_002",
                                     ed_nurse = "B15003_003",
                                     ed_k = "B15003_004",
                                     ed_1 = "B15003_005",
                                     ed_2 = "B15003_006",
                                     ed_3 = "B15003_007",
                                     ed_4 = "B15003_008",
                                     ed_5 = "B15003_009",
                                     ed_6 = "B15003_010",
                                     ed_7 = "B15003_011",
                                     ed_8 = "B15003_012",
                                     ed_9 = "B15003_013",
                                     ed_10 = "B15003_014",
                                     ed_11 = "B15003_015",
                                     ed_12_nd = "B15003_016",
                                     ed_12_d = "B15003_017",
                                     ed_ged = "B15003_018",
                                     ed_sc1l = "B15003_019",
                                     ed_sc1m = "B15003_020",
                                     ed_as = "B15003_021",
                                     ed_ba = "B15003_022",
                                     ed_ma = "B15003_023",
                                     ed_pd = "B15003_024",
                                     ed_phd = "B15003_025",
                                     gini = "B19083_001",
                                     mlf_1619 = "B23001_006",
                                     mup_1619 = "B23001_008",
                                     mlf_2021 = "B23001_013",
                                     mup_2021 = "B23001_015",
                                     mlf_2224 = "B23001_020",
                                     mup_2224 = "B23001_022",
                                     mlf_2529 = "B23001_027",
                                     mup_2529 = "B23001_029",
                                     mlf_3034 = "B23001_034",
                                     mup_3034 = "B23001_036",
                                     mlf_3544 = "B23001_041",
                                     mup_3544 = "B23001_043",
                                     mlf_4554 = "B23001_048",
                                     mup_4554 = "B23001_050",
                                     mlf_5559 = "B23001_055",
                                     mup_5559 = "B23001_057",
                                     mlf_6061 = "B23001_062",
                                     mup_6061 = "B23001_064",
                                     mlf_6264 = "B23001_069",
                                     mup_6264 = "B23001_071",
                                     mlf_6569 = "B23001_074",
                                     mup_6569 = "B23001_076",
                                     mlf_7074 = "B23001_079",
                                     mup_7074 = "B23001_081",
                                     mlf_75u = "B23001_084",
                                     mup_75u = "B23001_086",
                                     flp_1619 = "B23001_092",
                                     fup_1619 = "B23001_094",
                                     flp_2021 = "B23001_099",
                                     fup_2021 = "B23001_101",
                                     flp_2224 = "B23001_106",
                                     fup_2224 = "B23001_108",
                                     flp_2529 = "B23001_113",
                                     fup_2529 = "B23001_115",
                                     flp_3034 = "B23001_120",
                                     fup_3034 = "B23001_122",
                                     flp_3544 = "B23001_127",
                                     fup_3544 = "B23001_129",
                                     flp_4554 = "B23001_134",
                                     fup_4554 = "B23001_136",
                                     flp_5559 = "B23001_141",
                                     fup_5559 = "B23001_143",
                                     flp_6061 = "B23001_148",
                                     fup_6061 = "B23001_150",
                                     flp_6264 = "B23001_155",
                                     fup_6264 = "B23001_157",
                                     flp_6569 = "B23001_160",
                                     fup_6569 = "B23001_162",
                                     flp_7074 = "B23001_165",
                                     fup_7074 = "B23001_167",
                                     flp_75u = "B23001_170",
                                     fup_75u = "B23001_172"),
                       state = st, 
                       survey = "acs5",
                       output = "wide",
                       year = 2022)
  
  ## get land area info ##
  cosub2022.area <- as.data.frame(county_subdivisions(state=st,
                                                      year = 2022,
                                                      cb=TRUE))
  
  ## add state identifying variable ##
  cosub2022$state <- st
  cosub2022.area$state <- st
  
  ## store the data frame in the list ## 
  state.cosubs.2022[[state.counter]] <- cosub2022
  state.cosubs.2022.area[[state.counter]] <- cosub2022.area
  
  ## increase interval by 1 ## 
  state.counter <- state.counter + 1
  
}

######################
## first 2022 munis ##
######################

all.pls.2022 <- bind_rows(state.places.2022)
all.cosubs.2022 <- bind_rows(state.cosubs.2022)
all.pls.2022.area <- bind_rows(state.places.2022.area)
all.cosubs.2022.area <- bind_rows(state.cosubs.2022.area)

## attach land area values - places ##

all.pls.2022.area.f <- all.pls.2022.area %>%
  select(GEOID, 
         ALAND,
         AWATER) %>%
  mutate(aland = as.numeric(ALAND),
         awater = as.numeric(AWATER),
         total_area = aland + awater) %>%
  select(GEOID,
         aland,
         awater,
         total_area)

class(all.pls.2022.area.f)

length(unique(all.pls.2022$GEOID)) == nrow(all.pls.2022)
range(nchar(all.pls.2022$GEOID))

length(unique(all.pls.2022.area.f$GEOID)) == nrow(all.pls.2022.area.f)
range(nchar(all.pls.2022.area.f$GEOID))

## fix Louisville GEOID ##
all.pls.2022.final.m <- stata.merge(all.pls.2022,
                                    all.pls.2022.area.f,
                                    "GEOID")

## check merge ##
table(all.pls.2022.final.m$merge.variable, useNA = "ifany")

## keep matches ##
## the one non-matching obs is for Louisville, which is duplicated in the area data ##
all.pls.2022.final <- all.pls.2022.final.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable,
         -ends_with("M"))

## attach land area values - cosubs ##
all.cosubs.2022.area.f <- all.cosubs.2022.area %>%
  select(GEOID, 
         ALAND,
         AWATER) %>%
  mutate(aland = as.numeric(ALAND),
         awater = as.numeric(AWATER),
         total_area = aland + awater) %>%
  select(GEOID,
         aland,
         awater,
         total_area)

class(all.cosubs.2022.area.f)

length(unique(all.cosubs.2022$GEOID)) == nrow(all.cosubs.2022)
range(nchar(all.cosubs.2022$GEOID))

length(unique(all.cosubs.2022.area.f$GEOID)) == nrow(all.cosubs.2022.area.f)
range(nchar(all.cosubs.2022.area.f$GEOID))


all.cosubs.2022.final.m <- stata.merge(all.cosubs.2022,
                                       all.cosubs.2022.area.f,
                                       "GEOID")

## check merge ##
table(all.cosubs.2022.final.m$merge.variable, useNA = "ifany")

## keep merge.variable values 1,3 ##
all.cosubs.2022.final <- all.cosubs.2022.final.m %>%
  filter(merge.variable %in% c(1,3)) %>%
  select(-merge.variable)

## reformat the cosub dataframe ## 

cosub.rf.2022 <- all.cosubs.2022.final %>%
  filter(!grepl('CCD', NAME)) %>%
  mutate(geoid.f = paste(substr(GEOID,1,2),
                         substr(GEOID,6,10),
                         sep="")) %>%
  select(-ends_with("M"))

## create merge variable on pls dataframe ##
all.pls.2022.final$geoid.f <- all.pls.2022.final$GEOID

## check for overlapping munis ## 

nrow(all.pls.2022.final) == length(unique(all.pls.2022.final$geoid.f))
class(all.pls.2022.final$geoid.f)
range(nchar(trim(all.pls.2022.final$geoid.f)))

nrow(cosub.rf.2022) == length(unique(cosub.rf.2022$geoid.f))
class(cosub.rf.2022$geoid.f)
range(nchar(trim(cosub.rf.2022$geoid.f)))

overlap.2022 <- stata.merge(all.pls.2022.final,
                            cosub.rf.2022,
                            "geoid.f")

## check overlap ## 
table(overlap.2022$merge.variable)

## create final muni dataframe ##

## for the non-matches, can just drop the variables of the non-matching obs ##

munis.fin1.2022 <- overlap.2022 %>%
  filter(merge.variable ==1) %>%
  select(-ends_with(".y"),
         -merge.variable)

names(munis.fin1.2022) <- sub("\\.x", "", names(munis.fin1.2022))

munis.fin2.2022 <- overlap.2022 %>%
  filter(merge.variable ==2) %>%
  select(-ends_with(".x"),
         -merge.variable)

names(munis.fin2.2022) <- sub("\\.y", "", names(munis.fin2.2022))

## the matches are more complicated ##
## the issue here is that place codes have no county code component ##
## so there will be duplicates when merging with reduced cousub codes ##
## the solution will be to deal with the duplicates manually ##

## collect cousubs with duplicate GEOIDs ##
munis.fin3.2022.v1 <- overlap.2022 %>%
  filter(merge.variable ==3) %>%
  select(-ends_with(".x"),
         -merge.variable) %>%
  unique()

## collect the corresponding place duplicates ##
munis.fin3.2022.v2 <- overlap.2022 %>%
  filter(merge.variable ==3) %>%
  select(-ends_with(".y"),
         -merge.variable) %>%
  unique()

## make the vars conform ##
names(munis.fin3.2022.v1) <- sub("\\.y", "", names(munis.fin3.2022.v1))
names(munis.fin3.2022.v2) <- sub("\\.x", "", names(munis.fin3.2022.v2))

## stack the duplicates ##
munis.fin3.2022.stacked <- rbind(munis.fin3.2022.v1,
                                 munis.fin3.2022.v2)

## now, fix the remaining duplicates ##
## case 1: same exact places/cousubs, just listed both as places and county subs ##
munis.fin3.gr1.2022 <- munis.fin3.2022.stacked %>% 
  group_by_at(vars(-c(GEOID, NAME))) %>% 
  filter(n() > 1) %>%
  summarize_all(list(first)) %>%
  ungroup() %>%
  select(geoid.f,
         GEOID,
         NAME,
         everything())

munis.fin3.gr1.2022 <- as.data.frame(munis.fin3.gr1.2022)

## case 2: different places/cousubs, but same reduced FIPS codes as other cousubs ##

munis.fin3.gr2.2022 <- munis.fin3.2022.stacked %>%
  filter(geoid.f %notin% munis.fin3.gr1.2022$geoid.f)

## case 3: places/cousubs that extend into multiple counties ##

fin3.2.s1a.2022 <- munis.fin3.gr2.2022 %>%
  filter(nchar(GEOID) == 10) %>%
  group_by(geoid.f) %>%
  summarize_if(is.numeric, list(sum)) %>%
  ungroup() 

fin3.2.s1b.2022 <- munis.fin3.gr2.2022 %>%
  filter(nchar(GEOID) == 10) %>%
  group_by(geoid.f) %>%
  summarize_if(is.character, list(first)) %>%
  ungroup() 

print("ROWS PRIOR TO JOIN")
nrow(fin3.2.s1a.2022)
nrow(fin3.2.s1b.2022)

fin3.2.s1.2022 <- inner_join(fin3.2.s1a.2022,
                             fin3.2.s1b.2022,
                             "geoid.f")

print("ROWS AFTER JOIN")
nrow(fin3.2.s1.2022)

fin3.2.s1.rf.2022 <- fin3.2.s1.2022 %>%
  select(geoid.f,
         GEOID,
         NAME, 
         everything())

## this file is already unique at the GEOID/geoid.f level, so no need for de-duplication ##
fin3.2.s2.2022 <- munis.fin3.gr2.2022 %>%
  filter(nchar(GEOID) < 10) %>%
  unique()

fin3.2.s3.2022 <- rbind(fin3.2.s1.rf.2022,
                        fin3.2.s2.2022)

length(unique(fin3.2.s3.2022$geoid.f)) == nrow(fin3.2.s3.2022)
length(unique(fin3.2.s3.2022$GEOID)) == nrow(fin3.2.s3.2022)

fin3.2.s4.2022 <- fin3.2.s3.2022 %>%
  group_by(geoid.f, pop_totalE) %>%
  summarize_all(list(last)) %>%
  ungroup()

## need to check these manually ##
fin3.mcheck.2022 <- fin3.2.s4.2022 %>%
  group_by(geoid.f) %>%
  summarize(n = n(), .groups = "drop") %>%
  filter(n>1)

## manual fixes ##
munis.fin1.2022.final <- munis.fin1.2022

munis.fin2.2022.final <- munis.fin2.2022 %>%
  filter(geoid.f %notin% c("0900000","1700000","1800000",
                           "1993925","1994045","2357936",
                           "2500000","2600000","2700000",
                           "3400000","3600000","3613035",
                           "3654523","3675011","3900000",
                           "5500000",
                           ## duplicated cities ##
                           "1722268","1990178","1990575",
                           "1990577","1990645","1990720",
                           "1990872","3190642","1991370",
                           "1992077","1992368","1992890",
                           "1993034","1993036","1993927",
                           "1994018","1994633","1994658",
                           "1994702","2298000","3190177",
                           "1990872","3190642","3192063",
                           "3193018","3193383","5190556",
                           "5190948","5193531","5194476",
                           "5194851","5196187","1990893",
                           "1991085","1994597","2490000",
                           "3192237","3790086","5190780",
                           "5195291","5193859","5194403",
                           "5195363","5195875","5196427",
                           ## in addition to the dups in munis.fin.2022, remove the following
                           ## Newman Grove, Tilden, Galion, Columbus, Vermillion, Westerville
                           "3192103","3193203","3918010",
                           "3929176","3983349"))

munis.fin3a.2022.final <- fin3.2.s4.2022 %>%
  filter(GEOID %notin% c("1803551876","1801184014","3114134230",
                         "3100348935","3915101420","3904918000",
                         "3917328014","3911729162","3901749840",
                         "3906171892","3910978470","3904379716",
                         "3904983342"))

munis.fin3a.2022.final <- as.data.frame(munis.fin3a.2022.final)

munis.fin3b.2022.final <- munis.fin3.gr1.2022


munis.fin3.2022.final <- rbind(munis.fin3a.2022.final,
                               munis.fin3b.2022.final)

class(munis.fin3.2022.final)

munis.fin3.2022.final <- as.data.frame(munis.fin3.2022.final)

## combine data for final data frame ## 
munis.fin.2022 <- rbind(munis.fin1.2022.final,
                        munis.fin2.2022.final,
                        munis.fin3.2022.final) %>%
  ## these are duplicates, have 0 pop in 2005-2009, or were incorporated after 2005 ##
  filter(geoid.f %notin% c("3400000","0968170","3716820",
                           "4536655","3941740","3975620",
                           "1243250","0908490","2937610",
                           "1852398","0621230","4839292",
                           "4870052","3563145","3790552",
                           "1324768","1377652","3812080",
                           "1221150","4817429","0159088",
                           "0812393","0812387","2739538",
                           "4865360","1879298","3536230",
                           "0477490","2819100","2810140",
                           "3791502","3503820","5587725",
                           "1233700","3701760","4630420",
                           "4662340","4665660","2940214",
                           "2956620","4915720","4922875",
                           "4940470","4947290","4984050",
                           "4950150") & 
         !grepl('precinct', NAME) & 
          pop_totalE >0 & 
         !(grepl("Township|township", NAME) & state == "AR") & 
         (!(grepl("District|district", NAME) & state == "MD") |
          GEOID == "2423025")) %>%
  rename(GEOID_full = GEOID,
         GEOID = geoid.f)

nrow(munis.fin.2022) == length(unique(munis.fin.2022$GEOID))
nrow(munis.fin.2022) == length(unique(munis.fin.2022$GEOID_full))

## fix issues in FIPS ## 
munis.fin.2022$GEOID[munis.fin.2022$GEOID == "1571550"] <- "1517000"
munis.fin.2022$GEOID_full[munis.fin.2022$GEOID_full == "1571550"] <- "1517000"
munis.fin.2022$GEOID[munis.fin.2022$GEOID == "3727324"] <- "3727320"
munis.fin.2022$GEOID_full[munis.fin.2022$GEOID_full == "3727324"] <- "3727320"
munis.fin.2022$GEOID[munis.fin.2022$GEOID == "4873493"] <- "4856516"
munis.fin.2022$GEOID_full[munis.fin.2022$GEOID_full == "4873493"] <- "4856516"
munis.fin.2022$GEOID[munis.fin.2022$GEOID == "5589550"] <- "5589575"
munis.fin.2022$GEOID_full[munis.fin.2022$GEOID_full == "5510189550"] <- "5510189575"
munis.fin.2022$GEOID[munis.fin.2022$GEOID == "2525172"] <- "2525100"
munis.fin.2022$GEOID_full[munis.fin.2022$GEOID_full == "2502125172"] <- "2502125100"
munis.fin.2022$GEOID[munis.fin.2022$GEOID == "2556000"] <- "2555955"
munis.fin.2022$GEOID_full[munis.fin.2022$GEOID_full == "2502156000"] <- "2502155955"
munis.fin.2022$GEOID[munis.fin.2022$GEOID == "2578972"] <- "2578865"
munis.fin.2022$GEOID_full[munis.fin.2022$GEOID_full == "2502178972"] <- "2502178865"
munis.fin.2022$GEOID[munis.fin.2022$GEOID == "2581005"] <- "2580930"
munis.fin.2022$GEOID_full[munis.fin.2022$GEOID_full == "2502581005"] <- "2502580930"
munis.fin.2022$GEOID[munis.fin.2022$GEOID == "2508130"] <- "2508085"
munis.fin.2022$GEOID_full[munis.fin.2022$GEOID_full == "2502308130"] <- "2502308085"
munis.fin.2022$GEOID[munis.fin.2022$GEOID == "2501370"] <- "2501325"
munis.fin.2022$GEOID_full[munis.fin.2022$GEOID_full == "2501501370"] <- "2501501325"

## check against census of governments ## 

govs.in <- read_xlsx("Govt_Units_2022_Final.xlsx")

## fix FIPS errors ##
govs.in$FIPS_PLACE[govs.in$FIPS_STATE == "08" & govs.in$FIPS_COUNTY == "047" & govs.in$FIPS_PLACE == "12900"] <- "12910"
govs.in$FIPS_PLACE[govs.in$FIPS_STATE == "13" & govs.in$FIPS_COUNTY == "059" & govs.in$FIPS_PLACE == "03436"] <- "03440"
govs.in$FIPS_PLACE[govs.in$FIPS_STATE == "20" & govs.in$FIPS_COUNTY == "071" & govs.in$FIPS_PLACE == "28410"] <- "28412"
govs.in$FIPS_PLACE[govs.in$FIPS_STATE == "25" & govs.in$FIPS_COUNTY == "011" & govs.in$FIPS_PLACE == "27100"] <- "27060"
govs.in$FIPS_PLACE[govs.in$FIPS_STATE == "30" & govs.in$FIPS_COUNTY == "093" & govs.in$FIPS_PLACE == "11390"] <- "11397"
govs.in$FIPS_PLACE[govs.in$FIPS_STATE == "47" & govs.in$FIPS_COUNTY == "037" & govs.in$FIPS_PLACE == "52004"] <- "52006"
govs.in$FIPS_PLACE[govs.in$FIPS_STATE == "25" & govs.in$FIPS_COUNTY == "021" & govs.in$FIPS_PLACE == "25172"] <- "25100"
govs.in$FIPS_PLACE[govs.in$FIPS_STATE == "25" & govs.in$FIPS_COUNTY == "021" & govs.in$FIPS_PLACE == "78972"] <- "78865"
govs.in$FIPS_PLACE[govs.in$FIPS_STATE == "25" & govs.in$FIPS_COUNTY == "025" & govs.in$FIPS_PLACE == "81005"] <- "80930"
govs.in$FIPS_PLACE[govs.in$FIPS_STATE == "21" & govs.in$FIPS_COUNTY == "111" & govs.in$FIPS_PLACE == "48003"] <- "48006"
govs.in$FIPS_PLACE[govs.in$FIPS_STATE == "25" & govs.in$FIPS_COUNTY == "015" & govs.in$FIPS_PLACE == "01370"] <- "01325"


table(govs.in$UNIT_TYPE, useNA = "ifany")

govs1 <- govs.in %>%
  filter(UNIT_TYPE %in% c("2 - MUNICIPAL",
                          "3 - TOWNSHIP")) %>%
  mutate(GEOID = paste0(FIPS_STATE,
                        FIPS_PLACE),
         GEOID_full = paste0(FIPS_STATE,
                             FIPS_COUNTY,
                             FIPS_PLACE)) %>%
  select(GEOID)

govs2 <- govs.in %>%
  filter(UNIT_TYPE %in% c("2 - MUNICIPAL",
                          "3 - TOWNSHIP")) %>%
  mutate(GEOID = paste0(FIPS_STATE,
                        FIPS_PLACE),
         GEOID_full = paste0(FIPS_STATE,
                             FIPS_COUNTY,
                             FIPS_PLACE)) %>%
  select(GEOID_full)

## restrict to valid muni governments ##

## first stage - munis not duplicated on GEOID ## 
munis.fin.2022.dups.check <- munis.fin.2022 %>%
  group_by(GEOID) %>%
  summarize(n= n(), .groups = "drop") %>%
  filter(n>1)

munis.fin.2022.nondups <- munis.fin.2022 %>%
  filter(GEOID %notin% munis.fin.2022.dups.check$GEOID)

length(unique(munis.fin.2022.nondups$GEOID)) == nrow(munis.fin.2022.nondups)
range(nchar(munis.fin.2022.nondups$GEOID))
class(munis.fin.2022.nondups$GEOID)

length(unique(govs1$GEOID)) == nrow(govs1)
range(nchar(govs1$GEOID))
class(govs1$GEOID)

munis.gov.m1 <- stata.merge(munis.fin.2022.nondups,
                            govs1,
                            "GEOID")

## check merge ## 
table(munis.gov.m1$merge.variable, useNA = "ifany")

## keep matches ##
## MTM 01/11/2025: manually confirmed the valid non-matching munis ##

munis.gov.keep1 <- munis.gov.m1 %>%
  filter(merge.variable == 3 | 
         GEOID %in% c("1271625",
                      "1232400",
                      "2412150",
                      "2446725",
                      "1522700",
                      "5364365",
                      "0260310",
                      "0286490",
                      "1517000")) %>%
  select(-merge.variable)

munis.gov.nm2 <- munis.gov.m1 %>%
  filter(merge.variable == 2) %>%
  select(GEOID)

## second stage - munis duplicated on GEOID ## 

munis.fin.2022.dups <- munis.fin.2022 %>%
  filter(GEOID %in% munis.fin.2022.dups.check$GEOID)

length(unique(munis.fin.2022.dups$GEOID_full)) == nrow(munis.fin.2022.dups)
range(nchar(munis.fin.2022.dups$GEOID_full))
class(munis.fin.2022.dups$GEOID_full)

length(unique(govs2$GEOID_full)) == nrow(govs2)
range(nchar(govs2$GEOID_full))
class(govs2$GEOID_full)

munis.gov.m2 <- stata.merge(munis.fin.2022.dups,
                            govs2,
                            "GEOID_full")

## check merge ## 
table(munis.gov.m2$merge.variable, useNA = "ifany")

## keep matches ##
## MTM 01/11/2025: manually confirmed the valid non-matching munis ##

munis.gov.keep2 <- munis.gov.m2 %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable)

munis.gov.nm1 <- munis.gov.m2 %>%
  filter(merge.variable == 1) 

## now combine the files ##
munis.gov <- rbind(munis.gov.keep1,
                   munis.gov.keep2)

## is file unique ##
length(unique(munis.gov$GEOID)) == nrow(munis.gov)
length(unique(munis.gov$GEOID_full)) == nrow(munis.gov)

nrow(munis.gov)

munis.fin.2022.ids <- munis.gov %>%
  select(GEOID,
         NAME)

dups.2022 <- munis.gov %>%
  group_by(GEOID) %>%
  summarize(n = n(), .groups = "drop") %>%
  filter(n>1)

## clean data ## 

munis.fin.cl.2022 <- munis.gov %>%
  rename_with(~str_remove(., 'E')) %>%
  rename(GEOID_full = GOID_full,
         GEOID = GOID,
         NAME = NAM) %>%
  mutate(rb = case_when(rb_den !=0 ~ rowSums(cbind(rb_num1,
                                                   rb_num2,
                                                   rb_num3,
                                                   rb_num4),na.rm=T)/rb_den,
                        rb_den == 0 ~ NA),
    per_18t34 = rowSums(cbind(male_18t19 , female_18t19 ,male_20 , female_20 ,
                              male_21 , female_21 , male_22t24 , female_22t24 , 
                              male_25t29 , female_25t29 , male_30t34 , female_30t34),na.rm=T)/age_den,
    per_35t64 = rowSums(cbind(male_35t39 , female_35t39 , male_40t44 , female_40t44 , 
                              male_45t49 , female_45t49 , male_50t54 , female_50t54 , 
                              male_55t59 , female_55t59 , male_60t61 , female_60t61 , 
                              male_62t64 , female_62t64),na.rm=T)/age_den,
    per_65over = rowSums(cbind(male_65t66 , female_65t66 , male_67t69 , female_67t69 , 
                               male_70t74 , female_70t74 , male_75t79 , female_75t79 , 
                               male_80t84 , female_80t84 , male_85over , female_85over),na.rm=T)/age_den,
    per_asian = case_when(pop_total != 0 ~ pop_nh_asian/pop_total,
                          pop_total == 0 ~ 0),
    per_black = case_when(pop_total != 0 ~ pop_nh_black/pop_total,
                          pop_total == 0 ~ 0),
    per_latino = case_when(pop_total != 0 ~ pop_h/pop_total,
                           pop_total == 0 ~ 0),
    per_white = case_when(pop_total != 0 ~ pop_nh_white/pop_total,
                          pop_total == 0 ~ 0),
    per_aian = case_when(pop_total != 0 ~ pop_nh_aian/pop_total, 
                         pop_total == 0 ~ 0),
    per_other = case_when(pop_total != 0 ~ rowSums(cbind(pop_nh_other,
                                                         pop_nh_nhpi,
                                                         pop_nh_multi),na.rm=T)/pop_total, 
                          pop_total == 0 ~ 0),
    log_asian = case_when(pop_nh_asian != 0 ~ log(1/per_asian),
                          pop_nh_asian == 0 ~ 0),
    log_black = case_when(pop_nh_black != 0 ~ log(1/per_black),
                          pop_nh_black == 0 ~ 0),
    log_latino = case_when(pop_h != 0 ~ log(1/per_latino),
                           pop_h == 0 ~ 0),
    log_white = case_when(pop_nh_white != 0 ~ log(1/per_white),
                          pop_nh_white == 0 ~ 0),
    log_aian = case_when(pop_nh_aian != 0 ~ log(1/per_aian),
                         pop_nh_aian == 0 ~ 0),
    log_other = case_when(rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T) != 0 ~ log(1/per_other),
                          rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T) == 0 ~ 0),
    entropy_er = per_asian*log_asian + 
                 per_black*log_black + 
                 per_latino*log_latino + 
                 per_white*log_white +
                 per_aian*log_aian +
                 per_other*log_other,
    hownrt = ifelse(hhs>0, oo_hu/hhs, NA),
    vacant_rt = ifelse(total_hu>0, total_vacant/total_hu,NA),
    per_child = ifelse(hhlds_child_den>0,hhlds_child_num/hhlds_child_den,NA),
    per_nomove = ifelse(hhld_move_den>0, hhld_nomove/hhld_move_den, NA),
    per_immigrant = ifelse(foreign_den > 0, foreign_born/foreign_den, NA),
    hhld_pov_rt = ifelse(hhld_pov_den > 0, hhld_pov_num/hhld_pov_den, NA),
    per_bm = ifelse(ed_den>0, rowSums(cbind(ed_ba,
                                            ed_ma,
                                            ed_pd,
                                            ed_phd),na.rm=T)/ed_den,NA),
    unem_rt = rowSums(cbind(mup_1619, fup_1619, mup_2021, fup_2021, mup_2224, fup_2224, 
                            mup_2529, fup_2529, mup_3034, fup_3034, mup_3544, fup_3544, 
                            mup_4554, fup_4554, mup_5559, fup_5559, mup_6061, fup_6061,  
                            mup_6264, fup_6264, mup_6569, fup_6569, mup_7074, fup_7074, 
                            mup_75u, fup_75u),na.rm=T)/rowSums(cbind(mlf_1619, flp_1619, mlf_2021, 
                                                                     flp_2021, mlf_2224, flp_2224, 
                                                                     mlf_2529, flp_2529, mlf_3034,
                                                                     flp_3034, mlf_3544, flp_3544,  
                                                                     mlf_4554, flp_4554, mlf_5559,
                                                                     flp_5559, mlf_6061, flp_6061, 
                                                                     mlf_6264, flp_6264, mlf_6569, 
                                                                     flp_6569, mlf_7074, flp_7074, 
                                                                     mlf_75u, flp_75u), na.rm=T),
    land_area_sqmeters = aland,
    land_area = aland/2589988,
    pop_density = pop_total/land_area,
    phi1 = hhld_inc1/hhld_inc,
    phi2 = hhld_inc2/hhld_inc,
    phi3 = hhld_inc3/hhld_inc,
    phi4 = hhld_inc4/hhld_inc,
    phi5 = hhld_inc5/hhld_inc,
    phi6 = hhld_inc6/hhld_inc,
    phi7 = hhld_inc7/hhld_inc,
    phi8 = hhld_inc8/hhld_inc,
    phi9 = hhld_inc9/hhld_inc,
    phi10 = hhld_inc10/hhld_inc,
    phi11 = hhld_inc11/hhld_inc,
    phi12 = hhld_inc12/hhld_inc,
    phi13 = hhld_inc13/hhld_inc,
    phi14 = hhld_inc14/hhld_inc,
    phi15 = hhld_inc15/hhld_inc,
    phi16 = hhld_inc16/hhld_inc) %>%
  select(GEOID,
         GEOID_full,
         NAME,
         pop_total,
         pop_density,
         land_area,
         per_18t34,
         per_35t64,
         per_65over,
         median_age,
         per_asian,
         per_black,
         per_latino,
         per_white,
         entropy_er,
         per_child,
         per_nomove,
         per_immigrant,
         oo_hu,
         ro_hu,
         hownrt,
         rb,
         median_gross_rent,
         median_prop_value,
         vacant_rt,
         median_yr_str,
         median_hhld_inc,
         starts_with("phi"),
         per_bm,
         hhld_pov_rt,
         unem_rt,
         gini) %>%
  rename_at(vars(-GEOID,-GEOID_full,-NAME),function(x) paste0(x,"_2022"))

## final munis ## 

munis.fin.fmf.2022 <- munis.fin.cl.2022 %>%
  filter((!grepl("CDP", NAME)) |
         (grepl("CDP", NAME) & substr(GEOID,1,2) == "15") |
         (GEOID %in% c("0286490", "0260310","2365725",
                       "5364365", "2412150","2446725",
                       "5159091108","1232400","1271625")))

## fix GEOIDs ##
munis.fin.fmf.2022$GEOID[munis.fin.fmf.2022$GEOID == "1349008"] <- "1349000"
munis.fin.fmf.2022$GEOID_full[munis.fin.fmf.2022$GEOID_full == "1349008"] <- "1349000"
munis.fin.fmf.2022$GEOID[munis.fin.fmf.2022$GEOID == "2525172"] <- "2525100"
munis.fin.fmf.2022$GEOID_full[munis.fin.fmf.2022$GEOID_full == "2502125172"] <- "2502125100"
munis.fin.fmf.2022$GEOID[munis.fin.fmf.2022$GEOID == "3983090"] <- "3983111"
munis.fin.fmf.2022$GEOID_full[munis.fin.fmf.2022$GEOID_full == "3911383090"] <- "3911383111"
munis.fin.fmf.2022$GEOID[munis.fin.fmf.2022$GEOID == "2365725"] <- "2365760"
munis.fin.fmf.2022$GEOID_full[munis.fin.fmf.2022$GEOID_full == "2303165725"] <- "2303165760"

## unique? ##
nrow(munis.fin.fmf.2022) == length(unique(munis.fin.fmf.2022$GEOID))

## apply CT fix ## 

load(paste0(output_path,
            "003_ct_cs_t_cnty_xwalk.Rda"))

## create merge variable ##
ct.cs <- ct.cs %>%
  rename(GEOID_full = GEOID)

sum(substr(munis.fin.fmf.2022$GEOID,1,2)=="09")

munis.fin.fmf.2022.noct <- munis.fin.fmf.2022 %>%
  filter(substr(GEOID,1,2) != "09")

munis.fin.fmf.2022.ct <- munis.fin.fmf.2022 %>%
  filter(substr(GEOID,1,2) == "09")

range(nchar(ct.cs$GEOID_full))
range(nchar(munis.fin.fmf.2022.ct$GEOID_full))

length(unique(ct.cs$GEOID_full)) == nrow(ct.cs)
length(unique(munis.fin.fmf.2022.ct$GEOID_full)) == nrow(munis.fin.fmf.2022.ct)

munis.fin.fmf.2022.ct.rf.m <- stata.merge(munis.fin.fmf.2022.ct,
                                          ct.cs,
                                          "GEOID_full")

## check merge ##
table(munis.fin.fmf.2022.ct.rf.m$merge.variable, useNA = "ifany")

## these 20 merge.variable == 2 obs are already accounted for in the merge.variable == 1 obs ##
## MTM conducted manual checks on 1/12/25 confirming that this is a non-issue ##

ct.nm2 <- munis.fin.fmf.2022.ct.rf.m %>% 
  filter(merge.variable == 2) %>%
  select(GEOID_full,
         cousub_name_new)

ct.nm1 <- munis.fin.fmf.2022.ct.rf.m %>% 
  filter(merge.variable == 1) %>%
  select(GEOID_full,
         NAME)

## keep 1,3 obs ## 
munis.fin.fmf.2022.ct.rf <- munis.fin.fmf.2022.ct.rf.m %>%
  filter(merge.variable %in% c(1,3)) %>%
  mutate(GEOID_full_rf = case_when(merge.variable == 3 ~ cousub_geoid_old,
                                   merge.variable == 1 ~ GEOID_full)) %>%
  select(-GEOID_full,
         -sffips_old,
         -cfips_old,
         -county_name_old,
         -county_name_new,
         -csfp_new,
         -cfips_new,
         -cousub_geoid_old,
         -cousub_name_new,
         -cousubns,
         -cousub_lsad,
         -cousub_funcstat,
         -cousub_classfp,
         -merge.variable) %>%
  rename(GEOID_full = GEOID_full_rf) %>%
  select(GEOID,
         GEOID_full,
         everything())

## same number of obs? ##
nrow(munis.fin.fmf.2022.ct) == nrow(munis.fin.fmf.2022.ct.rf)

## combine the munis ## 
munis.final.2022 <- rbind(munis.fin.fmf.2022.ct.rf,
                          munis.fin.fmf.2022.noct)

## same number of obs? ##
nrow(munis.final.2022) == nrow(munis.fin.fmf.2022)

## still unique? ##
length(unique(munis.final.2022$GEOID)) == nrow(munis.final.2022)
length(unique(munis.final.2022$GEOID_full)) == nrow(munis.final.2022)

## double check the CDPs ##
cdps.2022 <- filter(munis.final.2022, grepl("CDP", NAME))

## check for tricky munis ##
muni.names.2022 <- munis.final.2022 %>%
  select(GEOID,
         GEOID_full,
         NAME)

## check data ##
summary(munis.final.2022)

## how many are metro munis? ## 

pl.metro <- tract.to.pl.metro %>%
  filter(!grepl("CDP", placenm) | GEOID_muni %in% c("1271625",
                                                    "1232400",
                                                    "2412150",
                                                    "2446725",
                                                    "1522700",
                                                    "5364365",
                                                    "1517000")) %>%
  select(GEOID_muni) %>%
  unique() %>%
  rename(GEOID = GEOID_muni)

range(nchar(munis.final.2022$GEOID))
range(nchar(pl.metro$GEOID))

length(unique(munis.final.2022$GEOID)) == nrow(munis.final.2022)
length(unique(pl.metro$GEOID)) == nrow(pl.metro)

ptm.2022 <- stata.merge(munis.final.2022,
                        pl.metro,
                        "GEOID")

table(ptm.2022$merge.variable, useNA = "ifany")

ptm.2022.keep <- ptm.2022 %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  select(GEOID) %>%
  unique()


cs.metro <- tract.to.cs.metro %>%
  select(-GEOID) %>%
  filter(!grepl("CCD", cousubnm)) %>%
  mutate(GEOID = paste0(`FIPS State Code`,
                        cousubfp),
         GEOID_full = paste0(`FIPS State Code`,
                             `FIPS County Code`,
                             cousubfp)) %>%
  select(GEOID,
         GEOID_full) %>%
  unique()

cs.fm.2022 <- ptm.2022 %>%
  filter(merge.variable == 1) %>%
  select(-merge.variable)

range(nchar(cs.fm.2022$GEOID))
range(nchar(cs.metro$GEOID))

length(unique(cs.fm.2022$GEOID)) == nrow(cs.fm.2022)
length(unique(cs.metro$GEOID)) == nrow(cs.metro)

ctm.2022 <- stata.merge(cs.fm.2022,
                        cs.metro,
                        "GEOID")

table(ctm.2022$merge.variable, useNA = "ifany")

ctm.2022.keep <- ctm.2022 %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  select(GEOID) %>%
  unique()

mtm.2022 <- rbind(ptm.2022.keep,
                  ctm.2022.keep)

nrow(mtm.2022)
length(unique(mtm.2022$GEOID)) ==  nrow(mtm.2022)

## save data for all munis ##
save(munis.final.2022,
     file = paste(output_path,
                  "004_allmunis_2022.Rda",
                  sep=""))

#####################
## now 2022 tracts ##
#####################

## combine all tract the data ## 
## also need to fix CT issue with 2022 Census data ##


ct.xwalk.fix.in <- read.csv("2022tractcrosswalk.csv",
                            header = T,
                            stringsAsFactors = F)

ct.xwalk.fix <- ct.xwalk.fix.in %>%
  mutate(GEOID20 = str_pad(tract_fips_2020, 11, "left", "0"),
         GEOID22 = str_pad(Tract_fips_2022, 11, "left", "0"))

range(nchar(trim(ct.xwalk.fix$GEOID20)))
range(nchar(trim(ct.xwalk.fix$GEOID22)))

alltracts.2022 <- bind_rows(state.tracts.2022) %>%
  rename(GEOID22 = GEOID)

length(unique(alltracts.2022$GEOID22)) == nrow(alltracts.2022)
length(unique(ct.xwalk.fix$GEOID22)) == nrow(ct.xwalk.fix)

alltracts.2022.rf.m <- stata.merge(alltracts.2022,
                                   ct.xwalk.fix,
                                   "GEOID22")

## check merge ##
table(alltracts.2022.rf.m$merge.variable, useNA = "ifany")

## apply fix ## 

alltracts.2022.rf <- alltracts.2022.rf.m %>%
  mutate(GEOID = case_when(merge.variable == 3 ~ GEOID20,
                           merge.variable == 1 ~ GEOID22)) %>%
  select(-merge.variable,
         -GEOID20,
         -GEOID22) %>%
  select(GEOID, 
         everything())

nrow(alltracts.2022)
nrow(alltracts.2022.rf)

## update GEOIDs to reflect 2010 tract boundaries ##

trtxwlk.2020.in <- read.table("trtxwalk20102020.txt",
                              sep = "|",
                              header = T,
                              stringsAsFactors= F)

trtxwlk.2020 <- trtxwlk.2020.in %>%
  mutate(GEOID10 = str_pad(trim(GEOID_TRACT_10), 11, "left", "0"),
         GEOID20 = str_pad(trim(GEOID_TRACT_20), 11, "left", "0"),
         GEOID = GEOID20,
         landarea10 = as.numeric(AREALAND_TRACT_10),
         waterarea10 = as.numeric(AREAWATER_TRACT_10),
         landarea20 = as.numeric(AREALAND_TRACT_20),
         waterarea20 = as.numeric(AREAWATER_TRACT_20),
         landport = as.numeric(AREALAND_PART),
         waterport = as.numeric(AREAWATER_PART),
         totarea10 = landarea10 + waterarea10,
         totarea20 = landarea20 + waterarea20,
         portareatot = landport + waterport,
         wt = landport/landarea20) %>%
  select(GEOID10,
         GEOID20,
         GEOID,
         wt)

range(nchar(trim(trtxwlk.2020$GEOID10)))
range(nchar(trim(trtxwlk.2020$GEOID20)))
range(nchar(trim(trtxwlk.2020$GEOID)))

## pre-merge checks ##

class(alltracts.2022.rf$GEOID)
range(nchar(trim(alltracts.2022.rf$GEOID)))
nrow(alltracts.2022.rf) == length(unique(alltracts.2022.rf$GEOID))

class(trtxwlk.2020$GEOID)
range(nchar(trim(trtxwlk.2020$GEOID)))

## merge the files ## 

alltracts.2022.updated <- stata.merge(alltracts.2022.rf,
                                      trtxwlk.2020,
                                      "GEOID")

## diagnose merge ## 
## merge.variable == 1 obs all have 0 pop ##
table(alltracts.2022.updated$merge.variable, useNA = "ifany")

## keep only the matches and obs with non-zero weights ##

alltracts.2022.match <- alltracts.2022.updated %>%
  filter(merge.variable == 3 &
         wt != 0) %>%
  select(-merge.variable,
         -ends_with("M")) %>%
  rename_with(~str_remove(., 'E')) %>%
  rename(GEOID = GOID,
         GEOID10 = GOID10,
         GEOID20 = GOID20)

## change the GEOID ##
## some 2020 tract IDs are duplicated because of    ##
## tracts that changed from 2010 to 2020; these     ##
## tracts will be condensed into one observation    ##
## via a weighted average, with weights from Census ##

## start with variables that can be added ## 

alltracts.2022.pr1a <- alltracts.2022.match %>%
  select(GEOID,
         GEOID10,
         GEOID20, 
         state,
         starts_with("pop"),
         ro_hhlds,
         starts_with("rb"),
         hhld_pov_den,
         hhld_pov_num,
         gq_pop,
         wt) %>%
  mutate_at(vars(pop_total:gq_pop),
            .funs = funs(. * wt))

all.tracts.2022.pr2a <- alltracts.2022.pr1a %>% 
  group_by(GEOID10) %>%
  summarize_at(vars(pop_total:gq_pop),
               funs(sum(., na.rm=T))) %>%
  ungroup() %>%
  rename(GEOID = GEOID10)

## now more complicated variables ## 

all.tracts.2022.pr2b <- alltracts.2022.match %>%
  select(GEOID,
         GEOID10,
         GEOID20,
         state, 
         median_gross_rent,
         median_prop_value,
         wt) %>%
  group_by(GEOID10) %>%
  summarize_at(vars(median_gross_rent:median_prop_value),
               funs(weighted.mean(., wt, na.rm=T))) %>%
  ungroup() %>%
  rename(GEOID = GEOID10)

## check obs ## 
print("ROWS PRIOR TO JOIN")
nrow(all.tracts.2022.pr2a)
nrow(all.tracts.2022.pr2b)

all.tracts.2022 <- all.tracts.2022.pr2a %>%
  inner_join(all.tracts.2022.pr2b, "GEOID") %>%
  mutate(FIPS = substr(GEOID,1,5),
         gq_per = ifelse(pop_total > 0, gq_pop/pop_total, NA),
         per_asian = case_when(pop_total != 0 ~ pop_nh_asian/pop_total,
                               pop_total == 0 ~ 0),
         per_black = case_when(pop_total != 0 ~ pop_nh_black/pop_total,
                               pop_total == 0 ~ 0),
         per_latino = case_when(pop_total != 0 ~ pop_h/pop_total,
                                pop_total == 0 ~ 0),
         per_white = case_when(pop_total != 0 ~ pop_nh_white/pop_total,
                               pop_total == 0 ~ 0),
         per_aian = case_when(pop_total != 0 ~ pop_nh_aian/pop_total, 
                              pop_total == 0 ~ 0),
         per_other = case_when(pop_total != 0 ~ rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T)/pop_total, 
                               pop_total == 0 ~ 0)) %>%
  filter(pop_total >= 500 & ro_hhlds >= 10 & gq_per <= 0.5) 

print("ROWS AFTER JOIN")
nrow(all.tracts.2022)

summary(all.tracts.2022)
sd(all.tracts.2022$median_gross_rent, na.rm=T)

## what is the 2022 poverty rate for metro tracts? ##

length(unique(all.tracts.2022$FIPS)) == nrow(all.tracts.2022)
range(nchar(all.tracts.2022$FIPS))

length(unique(msa.del.2020.rd$FIPS)) == nrow(msa.del.2020.rd)
range(nchar(msa.del.2020.rd$FIPS))

metro.tracts.2022.m <- stata.merge(all.tracts.2022, 
                                   msa.del.2020.rd, 
                                   "FIPS")

## check merge ##
table(metro.tracts.2022.m$merge.variable, useNA = "ifany")

metro.tracts.2022 <- metro.tracts.2022.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(hhld_pov_rt = ifelse(hhld_pov_den > 0, hhld_pov_num/hhld_pov_den, NA))

summary(metro.tracts.2022)

####################################
## now, process 2022 outcome data ##
####################################

## above average poverty tracts ##

npov.tracts.2022 <- metro.tracts.2022 %>%
  mutate(rb = case_when(rb_den !=0 ~ rowSums(cbind(rb_num1,
                                                   rb_num2,
                                                   rb_num3,
                                                   rb_num4),na.rm=T)/rb_den,
                        rb_den == 0 ~ NA)) %>%
  select(GEOID,
         pop_total,
         ro_hhlds,
         rb_num1,
         rb_num2,
         rb_num3,
         rb_num4,
         rb_den,
         rb,
         median_gross_rent,
         median_prop_value,
         hhld_pov_rt) %>%
  filter(hhld_pov_rt >= 0.13)

## get high poverty tracts ##

hpov.tracts.2022 <- metro.tracts.2022 %>%
  mutate(rb = case_when(rb_den !=0 ~ rowSums(cbind(rb_num1,
                                                   rb_num2,
                                                   rb_num3,
                                                   rb_num4),na.rm=T)/rb_den,
                        rb_den == 0 ~ NA)) %>%
  select(GEOID,
         pop_total,
         ro_hhlds,
         rb_num1,
         rb_num2,
         rb_num3,
         rb_num4,
         rb_den,
         rb,
         median_gross_rent,
         median_prop_value,
         hhld_pov_rt) %>%
  filter(hhld_pov_rt >= 0.4)

#########################################
## 2005-2009 tracts and municipalities ##
#########################################

v2009 <- load_variables(2009, "acs5", cache=T)

## initialize lists to store data frames ##
state.tracts.2009 <- list()
state.places.2009 <- list() 
state.cosubs.2009 <- list()

## initialize counter ## 
state.counter <- 1

## start the loop ##
for (st in states){
  
  ## tracts ##
  tr2009 <- get_acs(geography = "tract", 
                    variables = c(pop_total = "B01003_001",
                                  ro_hhlds = "B25003_003",
                                  rb_den = "B25070_001",
                                  rb_num1 = "B25070_007",
                                  rb_num2 = "B25070_008",
                                  rb_num3 = "B25070_009",
                                  rb_num4 = "B25070_010",
                                  median_gross_rent = "B25064_001",
                                  median_prop_value = "B25077_001",
                                  hhld_pov_den = "B17017_001",
                                  hhld_pov_num = "B17017_002",
                                  pop_nh_white = "B03002_003",
                                  pop_nh_black = "B03002_004",
                                  pop_nh_aian = "B03002_005",
                                  pop_nh_asian = "B03002_006",
                                  pop_nh_nhpi = "B03002_007",
                                  pop_nh_other = "B03002_008",
                                  pop_nh_multi = "B03002_009",
                                  pop_nh_multi_oth = "B03002_010",
                                  pop_nh_multi_noth ="B03002_011",
                                  pop_h = "B03002_012",
                                  gq = "B26001_001"),
                    state = st, 
                    survey = "acs5",
                    output = "wide",
                    year = 2009)
  
  tr2009$state <- st
  
  ## store the data frame in the list ## 
  state.tracts.2009[[state.counter]] <- tr2009

  ## get data for places ##
  pl2009 <- get_acs(geography = "place",
                     variables = c(pop_total = "B01003_001",
                                   race_den = "B02001_001",
                                   pop_white = "B02001_002",
                                   pop_black = "B02001_003",
                                   pop_aian = "B02001_004",
                                   pop_asian = "B02001_005",
                                   pop_nhpi = "B02001_006",
                                   pop_other = "B02001_007",
                                   pop_multi = "B02001_008",
                                   pop_multi_oth = "B02001_009",
                                   pop_multi_noth = "B02001_010",
                                   hisp_race_den = "B03002_001",
                                   pop_nh = "B03002_002",
                                   pop_nh_white = "B03002_003",
                                   pop_nh_black = "B03002_004",
                                   pop_nh_aian = "B03002_005",
                                   pop_nh_asian = "B03002_006",
                                   pop_nh_nhpi = "B03002_007",
                                   pop_nh_other = "B03002_008",
                                   pop_nh_multi = "B03002_009",
                                   pop_nh_multi_oth = "B03002_010",
                                   pop_nh_multi_noth ="B03002_011",
                                   pop_h = "B03002_012",
                                   pop_h_white = "B03002_013",
                                   pop_h_black = "B03002_014",
                                   pop_h_aian = "B03002_015",
                                   pop_h_asian = "B03002_016",
                                   pop_h_nhpi = "B03002_017",
                                   pop_h_other = "B03002_018",
                                   pop_h_multi = "B03002_019",
                                   pop_h_multi_oth = "B03002_020",
                                   pop_h_multi_noth = "B03002_021",
                                   age_den = "B01001_001",
                                   tot_male = "B01001_002",
                                   male_lt5 = "B01001_003",
                                   male_5t9 = "B01001_004",
                                   male_10t14 = "B01001_005",
                                   male_15t17 = "B01001_006",
                                   male_18t19 = "B01001_007",
                                   male_20 = "B01001_008",
                                   male_21 = "B01001_009",
                                   male_22t24 = "B01001_010",
                                   male_25t29 = "B01001_011",
                                   male_30t34 = "B01001_012",
                                   male_35t39 = "B01001_013",
                                   male_40t44 = "B01001_014",
                                   male_45t49 = "B01001_015",
                                   male_50t54 = "B01001_016",
                                   male_55t59 = "B01001_017",
                                   male_60t61 = "B01001_018",
                                   male_62t64 = "B01001_019",
                                   male_65t66 = "B01001_020",
                                   male_67t69 = "B01001_021",
                                   male_70t74 = "B01001_022",
                                   male_75t79 = "B01001_023",
                                   male_80t84 = "B01001_024",
                                   male_85over = "B01001_025",
                                   tot_female = "B01001_026",
                                   female_lt5 = "B01001_027",
                                   female_5t9 = "B01001_028",
                                   female_10t14 = "B01001_029",
                                   female_15t17 = "B01001_030",
                                   female_18t19 = "B01001_031",
                                   female_20 = "B01001_032",
                                   female_21 = "B01001_033",
                                   female_22t24 = "B01001_034",
                                   female_25t29 = "B01001_035",
                                   female_30t34 = "B01001_036",
                                   female_35t39 = "B01001_037",
                                   female_40t44 = "B01001_038",
                                   female_45t49 = "B01001_039",
                                   female_50t54 = "B01001_040",
                                   female_55t59 = "B01001_041",
                                   female_60t61 = "B01001_042",
                                   female_62t64 = "B01001_043",
                                   female_65t66 = "B01001_044",
                                   female_67t69 = "B01001_045",
                                   female_70t74 = "B01001_046",
                                   female_75t79 = "B01001_047",
                                   female_80t84 = "B01001_048",
                                   female_85over = "B01001_049",
                                   median_age = "B01002_001",
                                   hhlds_child_den = "B11005_001",
                                   hhlds_child_num = "B11005_002",
                                   hhld_move_den = "B07003_001",
                                   hhld_nomove = "B07003_004",
                                   foreign_den = "B05002_001",
                                   foreign_born = "B05002_013",
                                   hhs = "B25003_001",
                                   oo_hu = "B25003_002",
                                   ro_hu = "B25003_003",
                                   rb_den = "B25070_001",
                                   rb_num1 = "B25070_007",
                                   rb_num2 = "B25070_008",
                                   rb_num3 = "B25070_009",
                                   rb_num4 = "B25070_010",
                                   median_gross_rent = "B25064_001",
                                   median_prop_value = "B25077_001",
                                   median_yr_str = "B25035_001",
                                   total_hu = "B25002_001",
                                   total_vacant = "B25002_003",
                                   median_hhld_inc = "B19013_001",
                                   hhld_inc = "B19001_001",
                                   hhld_inc1 = "B19001_002",
                                   hhld_inc2 = "B19001_003",
                                   hhld_inc3 = "B19001_004",
                                   hhld_inc4 = "B19001_005",
                                   hhld_inc5 = "B19001_006",
                                   hhld_inc6 = "B19001_007",
                                   hhld_inc7 = "B19001_008",
                                   hhld_inc8 = "B19001_009",
                                   hhld_inc9 = "B19001_010",
                                   hhld_inc10 = "B19001_011",
                                   hhld_inc11 = "B19001_012",
                                   hhld_inc12 = "B19001_013",
                                   hhld_inc13 = "B19001_014",
                                   hhld_inc14 = "B19001_015",
                                   hhld_inc15 = "B19001_016",
                                   hhld_inc16 = "B19001_017",
                                   hhld_pov_den = "B17017_001",
                                   hhld_pov_num = "B17017_002",
                                   ed_den = "B15002_001",
                                   ed_male_den = "B15002_002",
                                   ed_male_none = "B15002_003",
                                   ed_male_nt4 = "B15002_004",
                                   ed_male_5t6 = "B15002_005",
                                   ed_male_7t8 = "B15002_006",
                                   ed_male_9 = "B15002_007",
                                   ed_male_10 = "B15002_008",
                                   ed_male_11 = "B15002_009",
                                   ed_male_12_nd = "B15002_010",
                                   ed_male_12_d = "B15002_011",
                                   ed_male_sc1l = "B15002_012",
                                   ed_male_sc1m = "B15002_013",
                                   ed_male_as = "B15002_014",
                                   ed_male_ba = "B15002_015",
                                   ed_male_ma = "B15002_016",
                                   ed_male_prof = "B15002_017",
                                   ed_male_phd = "B15002_018",
                                   ed_female_den = "B15002_019",
                                   ed_female_none = "B15002_020",
                                   ed_female_nt4 = "B15002_021",
                                   ed_female_5t6 = "B15002_022",
                                   ed_female_7t8 = "B15002_023",
                                   ed_female_9 = "B15002_024",
                                   ed_female_10 = "B15002_025",
                                   ed_female_11 = "B15002_026",
                                   ed_female_12_nd = "B15002_027",
                                   ed_female_12_d = "B15002_028",
                                   ed_female_sc1l = "B15002_029",
                                   ed_female_sc1m = "B15002_030",
                                   ed_female_as = "B15002_031",
                                   ed_female_ba = "B15002_032",
                                   ed_female_ma = "B15002_033",
                                   ed_female_prof = "B15002_034",
                                   ed_female_phd = "B15002_035",
                                   #gini = "B19083_001",
                                   mlf_1619 = "B23001_006",
                                   mup_1619 = "B23001_008",
                                   mlf_2021 = "B23001_013",
                                   mup_2021 = "B23001_015",
                                   mlf_2224 = "B23001_020",
                                   mup_2224 = "B23001_022",
                                   mlf_2529 = "B23001_027",
                                   mup_2529 = "B23001_029",
                                   mlf_3034 = "B23001_034",
                                   mup_3034 = "B23001_036",
                                   mlf_3544 = "B23001_041",
                                   mup_3544 = "B23001_043",
                                   mlf_4554 = "B23001_048",
                                   mup_4554 = "B23001_050",
                                   mlf_5559 = "B23001_055",
                                   mup_5559 = "B23001_057",
                                   mlf_6061 = "B23001_062",
                                   mup_6061 = "B23001_064",
                                   mlf_6264 = "B23001_069",
                                   mup_6264 = "B23001_071",
                                   mlf_6569 = "B23001_074",
                                   mup_6569 = "B23001_076",
                                   mlf_7074 = "B23001_079",
                                   mup_7074 = "B23001_081",
                                   mlf_75u = "B23001_084",
                                   mup_75u = "B23001_086",
                                   flp_1619 = "B23001_092",
                                   fup_1619 = "B23001_094",
                                   flp_2021 = "B23001_099",
                                   fup_2021 = "B23001_101",
                                   flp_2224 = "B23001_106",
                                   fup_2224 = "B23001_108",
                                   flp_2529 = "B23001_113",
                                   fup_2529 = "B23001_115",
                                   flp_3034 = "B23001_120",
                                   fup_3034 = "B23001_122",
                                   flp_3544 = "B23001_127",
                                   fup_3544 = "B23001_129",
                                   flp_4554 = "B23001_134",
                                   fup_4554 = "B23001_136",
                                   flp_5559 = "B23001_141",
                                   fup_5559 = "B23001_143",
                                   flp_6061 = "B23001_148",
                                   fup_6061 = "B23001_150",
                                   flp_6264 = "B23001_155",
                                   fup_6264 = "B23001_157",
                                   flp_6569 = "B23001_160",
                                   fup_6569 = "B23001_162",
                                   flp_7074 = "B23001_165",
                                   fup_7074 = "B23001_167",
                                   flp_75u = "B23001_170",
                                   fup_75u = "B23001_172"),
                     state = st,
                     survey = "acs5",
                     output = "wide",
                     year = 2009)
  
  ## add state identifying variable ##
  pl2009$state <- st
  
  ## store the data frame in the list ## 
  state.places.2009[[state.counter]] <- pl2009
  
  ## now, county subdivisions ##
  
  cosub2009 <- get_acs(geography = "county subdivision", 
                       variables = c(pop_total = "B01003_001",
                                     race_den = "B02001_001",
                                     pop_white = "B02001_002",
                                     pop_black = "B02001_003",
                                     pop_aian = "B02001_004",
                                     pop_asian = "B02001_005",
                                     pop_nhpi = "B02001_006",
                                     pop_other = "B02001_007",
                                     pop_multi = "B02001_008",
                                     pop_multi_oth = "B02001_009",
                                     pop_multi_noth = "B02001_010",
                                     hisp_race_den = "B03002_001",
                                     pop_nh = "B03002_002",
                                     pop_nh_white = "B03002_003",
                                     pop_nh_black = "B03002_004",
                                     pop_nh_aian = "B03002_005",
                                     pop_nh_asian = "B03002_006",
                                     pop_nh_nhpi = "B03002_007",
                                     pop_nh_other = "B03002_008",
                                     pop_nh_multi = "B03002_009",
                                     pop_nh_multi_oth = "B03002_010",
                                     pop_nh_multi_noth ="B03002_011",
                                     pop_h = "B03002_012",
                                     pop_h_white = "B03002_013",
                                     pop_h_black = "B03002_014",
                                     pop_h_aian = "B03002_015",
                                     pop_h_asian = "B03002_016",
                                     pop_h_nhpi = "B03002_017",
                                     pop_h_other = "B03002_018",
                                     pop_h_multi = "B03002_019",
                                     pop_h_multi_oth = "B03002_020",
                                     pop_h_multi_noth = "B03002_021",
                                     age_den = "B01001_001",
                                     tot_male = "B01001_002",
                                     male_lt5 = "B01001_003",
                                     male_5t9 = "B01001_004",
                                     male_10t14 = "B01001_005",
                                     male_15t17 = "B01001_006",
                                     male_18t19 = "B01001_007",
                                     male_20 = "B01001_008",
                                     male_21 = "B01001_009",
                                     male_22t24 = "B01001_010",
                                     male_25t29 = "B01001_011",
                                     male_30t34 = "B01001_012",
                                     male_35t39 = "B01001_013",
                                     male_40t44 = "B01001_014",
                                     male_45t49 = "B01001_015",
                                     male_50t54 = "B01001_016",
                                     male_55t59 = "B01001_017",
                                     male_60t61 = "B01001_018",
                                     male_62t64 = "B01001_019",
                                     male_65t66 = "B01001_020",
                                     male_67t69 = "B01001_021",
                                     male_70t74 = "B01001_022",
                                     male_75t79 = "B01001_023",
                                     male_80t84 = "B01001_024",
                                     male_85over = "B01001_025",
                                     tot_female = "B01001_026",
                                     female_lt5 = "B01001_027",
                                     female_5t9 = "B01001_028",
                                     female_10t14 = "B01001_029",
                                     female_15t17 = "B01001_030",
                                     female_18t19 = "B01001_031",
                                     female_20 = "B01001_032",
                                     female_21 = "B01001_033",
                                     female_22t24 = "B01001_034",
                                     female_25t29 = "B01001_035",
                                     female_30t34 = "B01001_036",
                                     female_35t39 = "B01001_037",
                                     female_40t44 = "B01001_038",
                                     female_45t49 = "B01001_039",
                                     female_50t54 = "B01001_040",
                                     female_55t59 = "B01001_041",
                                     female_60t61 = "B01001_042",
                                     female_62t64 = "B01001_043",
                                     female_65t66 = "B01001_044",
                                     female_67t69 = "B01001_045",
                                     female_70t74 = "B01001_046",
                                     female_75t79 = "B01001_047",
                                     female_80t84 = "B01001_048",
                                     female_85over = "B01001_049",
                                     median_age = "B01002_001",
                                     hhlds_child_den = "B11005_001",
                                     hhlds_child_num = "B11005_002",
                                     hhld_move_den = "B07003_001",
                                     hhld_nomove = "B07003_004",
                                     foreign_den = "B05002_001",
                                     foreign_born = "B05002_013",
                                     hhs = "B25003_001",
                                     oo_hu = "B25003_002",
                                     ro_hu = "B25003_003",
                                     rb_den = "B25070_001",
                                     rb_num1 = "B25070_007",
                                     rb_num2 = "B25070_008",
                                     rb_num3 = "B25070_009",
                                     rb_num4 = "B25070_010",
                                     median_gross_rent = "B25064_001",
                                     median_prop_value = "B25077_001",
                                     median_yr_str = "B25035_001",
                                     total_hu = "B25002_001",
                                     total_vacant = "B25002_003",
                                     median_hhld_inc = "B19013_001",
                                     hhld_inc = "B19001_001",
                                     hhld_inc1 = "B19001_002",
                                     hhld_inc2 = "B19001_003",
                                     hhld_inc3 = "B19001_004",
                                     hhld_inc4 = "B19001_005",
                                     hhld_inc5 = "B19001_006",
                                     hhld_inc6 = "B19001_007",
                                     hhld_inc7 = "B19001_008",
                                     hhld_inc8 = "B19001_009",
                                     hhld_inc9 = "B19001_010",
                                     hhld_inc10 = "B19001_011",
                                     hhld_inc11 = "B19001_012",
                                     hhld_inc12 = "B19001_013",
                                     hhld_inc13 = "B19001_014",
                                     hhld_inc14 = "B19001_015",
                                     hhld_inc15 = "B19001_016",
                                     hhld_inc16 = "B19001_017",
                                     hhld_pov_den = "B17017_001",
                                     hhld_pov_num = "B17017_002",
                                     ed_den = "B15002_001",
                                     ed_male_den = "B15002_002",
                                     ed_male_none = "B15002_003",
                                     ed_male_nt4 = "B15002_004",
                                     ed_male_5t6 = "B15002_005",
                                     ed_male_7t8 = "B15002_006",
                                     ed_male_9 = "B15002_007",
                                     ed_male_10 = "B15002_008",
                                     ed_male_11 = "B15002_009",
                                     ed_male_12_nd = "B15002_010",
                                     ed_male_12_d = "B15002_011",
                                     ed_male_sc1l = "B15002_012",
                                     ed_male_sc1m = "B15002_013",
                                     ed_male_as = "B15002_014",
                                     ed_male_ba = "B15002_015",
                                     ed_male_ma = "B15002_016",
                                     ed_male_prof = "B15002_017",
                                     ed_male_phd = "B15002_018",
                                     ed_female_den = "B15002_019",
                                     ed_female_none = "B15002_020",
                                     ed_female_nt4 = "B15002_021",
                                     ed_female_5t6 = "B15002_022",
                                     ed_female_7t8 = "B15002_023",
                                     ed_female_9 = "B15002_024",
                                     ed_female_10 = "B15002_025",
                                     ed_female_11 = "B15002_026",
                                     ed_female_12_nd = "B15002_027",
                                     ed_female_12_d = "B15002_028",
                                     ed_female_sc1l = "B15002_029",
                                     ed_female_sc1m = "B15002_030",
                                     ed_female_as = "B15002_031",
                                     ed_female_ba = "B15002_032",
                                     ed_female_ma = "B15002_033",
                                     ed_female_prof = "B15002_034",
                                     ed_female_phd = "B15002_035",
                                     #gini = "B19083_001",
                                     mlf_1619 = "B23001_006",
                                     mup_1619 = "B23001_008",
                                     mlf_2021 = "B23001_013",
                                     mup_2021 = "B23001_015",
                                     mlf_2224 = "B23001_020",
                                     mup_2224 = "B23001_022",
                                     mlf_2529 = "B23001_027",
                                     mup_2529 = "B23001_029",
                                     mlf_3034 = "B23001_034",
                                     mup_3034 = "B23001_036",
                                     mlf_3544 = "B23001_041",
                                     mup_3544 = "B23001_043",
                                     mlf_4554 = "B23001_048",
                                     mup_4554 = "B23001_050",
                                     mlf_5559 = "B23001_055",
                                     mup_5559 = "B23001_057",
                                     mlf_6061 = "B23001_062",
                                     mup_6061 = "B23001_064",
                                     mlf_6264 = "B23001_069",
                                     mup_6264 = "B23001_071",
                                     mlf_6569 = "B23001_074",
                                     mup_6569 = "B23001_076",
                                     mlf_7074 = "B23001_079",
                                     mup_7074 = "B23001_081",
                                     mlf_75u = "B23001_084",
                                     mup_75u = "B23001_086",
                                     flp_1619 = "B23001_092",
                                     fup_1619 = "B23001_094",
                                     flp_2021 = "B23001_099",
                                     fup_2021 = "B23001_101",
                                     flp_2224 = "B23001_106",
                                     fup_2224 = "B23001_108",
                                     flp_2529 = "B23001_113",
                                     fup_2529 = "B23001_115",
                                     flp_3034 = "B23001_120",
                                     fup_3034 = "B23001_122",
                                     flp_3544 = "B23001_127",
                                     fup_3544 = "B23001_129",
                                     flp_4554 = "B23001_134",
                                     fup_4554 = "B23001_136",
                                     flp_5559 = "B23001_141",
                                     fup_5559 = "B23001_143",
                                     flp_6061 = "B23001_148",
                                     fup_6061 = "B23001_150",
                                     flp_6264 = "B23001_155",
                                     fup_6264 = "B23001_157",
                                     flp_6569 = "B23001_160",
                                     fup_6569 = "B23001_162",
                                     flp_7074 = "B23001_165",
                                     fup_7074 = "B23001_167",
                                     flp_75u = "B23001_170",
                                     fup_75u = "B23001_172"),
                       state = st,
                       survey = "acs5",
                       output = "wide",
                       year = 2009)
  
  ## add state identifying variable ##
  cosub2009$state <- st
  
  ## store the data frame in the list ## 
  state.cosubs.2009[[state.counter]] <- cosub2009
  
  ## increase interval by 1 ## 
  state.counter <- state.counter + 1
  
}

#####################
## now, 2009 munis ##
#####################

all.pls.2009 <- bind_rows(state.places.2009)

## add missing variables for pls ##

all.pls.2009.rest.in <- read.csv("pls2009.csv",
                                  header=T,
                                  stringsAsFactors = F)

all.pls.2009.rest <- all.pls.2009.rest.in %>%
  rename(gini = SE_A14028_001) %>%
  mutate(GEOID = str_pad(Geo_FIPS,7,"left","0")) %>%
  select(GEOID,
         gini) 

length(unique(all.pls.2009$GEOID)) == nrow(all.pls.2009)
range(nchar(trim(all.pls.2009$GEOID)))

length(unique(all.pls.2009.rest$GEOID)) == nrow(all.pls.2009.rest)
range(nchar(trim(all.pls.2009.rest$GEOID)))


all.pls.acs.2009.final.m <- stata.merge(all.pls.2009,
                                        all.pls.2009.rest,
                                        "GEOID")

## check merge ##
table(all.pls.acs.2009.final.m$merge.variable, useNA = "ifany")

## keep matches ##
all.pls.2009.final <- all.pls.acs.2009.final.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable,
         -ends_with("M"))

## Tribune, KS switches to Greeley unified gov in 2009 ##
all.pls.2009.final$GEOID[all.pls.2009.final$GEOID == "2071450"] <- "2028412"
all.pls.2009.final$NAME[all.pls.2009.final$NAME == "Tribune city, Kansas"] <- "Greeley County unified government (balance), Kansas"

## add missing variables for cosubs ##

all.cosubs.2009 <- bind_rows(state.cosubs.2009)

all.cosubs.2009.rest.in <- read.csv("cousubs2009.csv",
                                    header=T,
                                    stringsAsFactors = F)

all.cosubs.2009.rest <- all.cosubs.2009.rest.in %>%
  rename(gini = SE_A14028_001) %>%
  mutate(GEOID = str_pad(Geo_FIPS,10,"left","0")) %>%
  select(GEOID,
         gini)

length(unique(all.cosubs.2009$GEOID)) == nrow(all.cosubs.2009)
range(nchar(trim(all.cosubs.2009$GEOID)))

length(unique(all.cosubs.2009.rest$GEOID)) == nrow(all.cosubs.2009.rest)
range(nchar(trim(all.cosubs.2009.rest$GEOID)))


all.cosubs.acs.2009.final.m <- stata.merge(all.cosubs.2009,
                                           all.cosubs.2009.rest,
                                           "GEOID")

## check merge ##
table(all.cosubs.acs.2009.final.m$merge.variable, useNA = "ifany")

## keep matches ##
all.cosubs.2009.final <- all.cosubs.acs.2009.final.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable,
         -ends_with("M"))

## reformat the cosub dataframe ## 

cosub.2009.rf <- all.cosubs.2009.final %>%
  filter(!grepl('CCD', NAME)) %>%
  mutate(geoid.f = paste(substr(GEOID,1,2),
                         substr(GEOID,6,10),
                         sep="")) %>%
  select(GEOID, 
         everything())

## filter to munis in the 2022 file ## 

pls.2009.rd <- all.pls.2009.final %>%
  mutate(GEOID = case_when(GEOID == "2500765" ~ "2500840",
                           GEOID == "2519330" ~ "2519370",
                           GEOID == "2577850" ~ "2577890",
                           GEOID == "2635560" ~ "2682453",
                           GEOID == "4740240" ~ "4764668",
                           GEOID == "1239075" ~ "1239081",
                           TRUE ~ GEOID)) %>%
  filter(GEOID %in% munis.final.2022$GEOID & 
         ## dropping the CDP versions of these munis ##
         ## also dropping Princeton borough, NJ to manually account for merger ##
         GEOID %notin% c("2527060","2365725","2524960","3460900")) %>%
  mutate(geoid.f = GEOID)

## change names of munis that changed names during the study period ##
pls.2009.rd$NAME[pls.2009.rd$GEOID == "4764668"] <- "Rocky Top, Tennessee"
pls.2009.rd$NAME[pls.2009.rd$GEOID == "1239081"] <- "Lake Worth Beach, Florida"


length(unique(pls.2009.rd$GEOID)) == nrow(pls.2009.rd)

## do some processing to account for merger of Princeton borough and township ##

## all other munis ##
cosub.2009.rf.v1 <- cosub.2009.rf %>%
  filter(GEOID %notin% c("3402160900","3402160915"))

nrow(cosub.2009.rf.v1) == nrow(cosub.2009.rf) - 2

## princeton merger ##
cosub.2009.rf.v2 <- cosub.2009.rf %>%
  filter(GEOID %in% c("3402160900","3402160915")) %>%
  mutate(GEOID = case_when(GEOID == "3402160915" ~ "3402160900",
                           TRUE ~ GEOID),
         geoid.f = case_when(geoid.f == "3460915" ~ "3460900",
                             TRUE ~ geoid.f))

## check ##
cosub.2009.rf.v2$GEOID
cosub.2009.rf.v2$geoid.f

## names ##
cosub.2009.rf.v2a <- cosub.2009.rf.v2 %>%
  group_by(GEOID, geoid.f) %>%
  summarize(n = n(), .groups = "drop") %>%
  select(-n) %>%
  mutate(NAME = "Princeton municipality, NJ",
         state = "NJ") 

## sum vars ##
cosub.2009.rf.v2b <- cosub.2009.rf.v2 %>%
  group_by(GEOID, geoid.f) %>%
  select(-median_ageE,
         -median_gross_rentE,
         -median_prop_valueE,
         -median_yr_strE,
         -median_hhld_incE) %>%
  summarize_if(is.numeric, sum, na.rm=T) %>%
  ungroup()

## median vars ##
cosub.2009.rf.v2c <- cosub.2009.rf.v2 %>%
  group_by(GEOID, geoid.f) %>%
  select(GEOID,
         geoid.f,
         pop_totalE,
         median_ageE,
         median_gross_rentE,
         median_prop_valueE,
         median_yr_strE,
         median_hhld_incE) %>%
  mutate(wt = pop_totalE/sum(pop_totalE,na.rm=T)) %>%
  summarize_at(vars(median_ageE:median_hhld_incE),
               funs(weighted.mean(., wt, na.rm=T))) %>%
  ungroup()

print("ROWS PRIOR TO JOIN")
nrow(cosub.2009.rf.v2a)
nrow(cosub.2009.rf.v2b)
nrow(cosub.2009.rf.v2c)

cosub.2009.rf.v2.fin <- cosub.2009.rf.v2a %>%
  left_join(cosub.2009.rf.v2b, c("GEOID","geoid.f")) %>%
  left_join(cosub.2009.rf.v2c, c("GEOID","geoid.f"))

print("ROWS AFTER JOIN")
print(cosub.2009.rf.v2.fin)

ncol(cosub.2009.rf.v2.fin) == ncol(cosub.2009.rf.v2)

## check from Kevin Ogoro on stackoverflow.com: https://stackoverflow.com/questions/24027605/determine-the-number-of-na-values-in-a-column ##
sapply(cosub.2009.rf.v2.fin, function(y) sum(length(which(is.na(y)))))


cosub.2009.frd <- rbind(cosub.2009.rf.v2.fin,
                        cosub.2009.rf.v1)

## check rows ##
nrow(cosub.2009.frd) == nrow(cosub.2009.rf) - 1

cosub.2009.rd <- cosub.2009.frd %>%
  mutate(GEOID = case_when(GEOID == "2501127025" ~ "2501127060",
                           GEOID == "2500901185" ~ "2500901260",
                           GEOID == "2502107665" ~ "2502107740",
                           GEOID == "2501724925" ~ "2501724960",
                           GEOID == "2502763270" ~ "2502763345",
                           GEOID == "2501352105" ~ "2501352144",
                           GEOID == "2500546575" ~ "2500546598",
                           GEOID == "5505909825" ~ "5505909800",
                           TRUE ~ GEOID),
         geoid.f = case_when(geoid.f == "2527025" ~ "2527060",
                             geoid.f == "2501185" ~ "2501260",
                             geoid.f == "2507665" ~ "2507740",
                             geoid.f == "2524925" ~ "2524960",
                             geoid.f == "2563270" ~ "2563345",
                             geoid.f == "2552105" ~ "2552144",
                             geoid.f == "2546575" ~ "2546598",
                             geoid.f == "5509825" ~ "5509800",
                             TRUE ~ geoid.f)) %>%
  filter(GEOID %in% munis.final.2022$GEOID_full & 
         geoid.f %notin% pls.2009.rd$GEOID) %>%
  select(GEOID, 
         geoid.f,
         everything())

length(unique(cosub.2009.rd$GEOID)) == nrow(cosub.2009.rd)

## check obs number ##
nrow(pls.2009.rd) + nrow(cosub.2009.rd)
nrow(munis.final.2022)

## stack the filtered 2009 munis ## 
munis.2009.rdcb <- rbind(pls.2009.rd,
                         cosub.2009.rd) %>%
  rename(GEOID_full = GEOID,
         GEOID = geoid.f)

## check unique ##
length(unique(munis.2009.rdcb$GEOID)) == nrow(munis.2009.rdcb)

## check for tricky munis ##
munis.2009.nmchk1 <- munis.2009.rdcb %>%
  select(GEOID,
         GEOID_full,
         NAME)

## check that these are all in the 2022 file ##

m0922.check <- stata.merge(munis.2009.rdcb,
                           munis.final.2022,
                           "GEOID")

## check merge ##
table(m0922.check$merge.variable, useNA = "ifany")

## look into these non-matches ##
m0922.nm <- m0922.check %>%
  filter(merge.variable == 2) %>%
  select(GEOID,
         ends_with(".y"))

## none of these are in the raw 2009 data ## 
m0922.nm.check1 <- filter(all.pls.2009, GEOID %in% m0922.nm$GEOID)
nrow(m0922.nm.check1)

m0922.nm.check2 <- filter(all.cosubs.2009, GEOID %in% m0922.nm$GEOID_full.y)
nrow(m0922.nm.check2)

## clean data ## 

munis.fin.cl.2009 <- munis.2009.rdcb %>%
  filter(!grepl('precinct', NAME) & pop_totalE >0) %>%
  rename_with(~str_remove(., 'E')) %>%
  rename(GEOID_full = GOID_full,
         GEOID = GOID,
         NAME = NAM) %>%
  mutate(rb = case_when(rb_den !=0 ~ rowSums(cbind(rb_num1,
                                                   rb_num2,
                                                   rb_num3,
                                                   rb_num4),na.rm=T)/rb_den,
                        rb_den == 0 ~ NA),
         per_18t34 = rowSums(cbind(male_18t19 , female_18t19 ,male_20 , female_20 ,
                              male_21 , female_21 , male_22t24 , female_22t24 , 
                              male_25t29 , female_25t29 , male_30t34 , female_30t34),na.rm=T)/age_den,
         per_35t64 = rowSums(cbind(male_35t39 , female_35t39 , male_40t44 , female_40t44 , 
                              male_45t49 , female_45t49 , male_50t54 , female_50t54 , 
                              male_55t59 , female_55t59 , male_60t61 , female_60t61 , 
                              male_62t64 , female_62t64),na.rm=T)/age_den,
         per_65over = rowSums(cbind(male_65t66 , female_65t66 , male_67t69 , female_67t69 , 
                               male_70t74 , female_70t74 , male_75t79 , female_75t79 , 
                               male_80t84 , female_80t84 , male_85over , female_85over),na.rm=T)/age_den,
         per_asian = case_when(pop_total != 0 ~ pop_nh_asian/pop_total,
                          pop_total == 0 ~ 0),
         per_black = case_when(pop_total != 0 ~ pop_nh_black/pop_total,
                          pop_total == 0 ~ 0),
         per_latino = case_when(pop_total != 0 ~ pop_h/pop_total,
                           pop_total == 0 ~ 0),
         per_white = case_when(pop_total != 0 ~ pop_nh_white/pop_total,
                          pop_total == 0 ~ 0),
         per_aian = case_when(pop_total != 0 ~ pop_nh_aian/pop_total, 
                         pop_total == 0 ~ 0),
         per_other = case_when(pop_total != 0 ~ rowSums(cbind(pop_nh_other,
                                                         pop_nh_nhpi,
                                                         pop_nh_multi),na.rm=T)/pop_total, 
                          pop_total == 0 ~ 0),
         log_asian = case_when(pop_nh_asian != 0 ~ log(1/per_asian),
                          pop_nh_asian == 0 ~ 0),
         log_black = case_when(pop_nh_black != 0 ~ log(1/per_black),
                          pop_nh_black == 0 ~ 0),
         log_latino = case_when(pop_h != 0 ~ log(1/per_latino),
                           pop_h == 0 ~ 0),
         log_white = case_when(pop_nh_white != 0 ~ log(1/per_white),
                          pop_nh_white == 0 ~ 0),
         log_aian = case_when(pop_nh_aian != 0 ~ log(1/per_aian),
                         pop_nh_aian == 0 ~ 0),
         log_other = case_when(rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T) != 0 ~ log(1/per_other),
                          rowSums(cbind(pop_nh_other,pop_nh_nhpi,pop_nh_multi),na.rm=T) == 0 ~ 0),
         entropy_er = per_asian*log_asian + 
                 per_black*log_black + 
                 per_latino*log_latino + 
                 per_white*log_white +
                 per_aian*log_aian +
                 per_other*log_other,
         ## adjust $ vars for inflation (All Items CPI-U-RS Annual Averages) ##
         median_gross_rent = median_gross_rent * 437.6/316.7,
         median_prop_value = median_prop_value * 437.6/316.7,
         median_hhld_inc = median_hhld_inc * 437.6/316.7,
         hownrt = ifelse(hhs>0, oo_hu/hhs, NA),
         vacant_rt = ifelse(total_hu>0, total_vacant/total_hu,NA),
         per_child = ifelse(hhlds_child_den>0,hhlds_child_num/hhlds_child_den,NA),
         per_nomove = ifelse(hhld_move_den>0, hhld_nomove/hhld_move_den, NA),
         per_immigrant = ifelse(foreign_den > 0, foreign_born/foreign_den, NA),
         hhld_pov_rt = ifelse(hhld_pov_den >0, hhld_pov_num/hhld_pov_den, NA),
         per_bm = ifelse(ed_den>0,rowSums(cbind(ed_male_ba, ed_female_ba, ed_male_ma, ed_female_ma,
                                                 ed_male_prof, ed_female_prof, ed_male_phd, ed_female_phd),na.rm=T)/ed_den,NA),
         unem_rt = rowSums(cbind(mup_1619,fup_1619,mup_2021,fup_2021,mup_2224,
                            fup_2224,mup_2529,fup_2529,mup_3034,fup_3034,
                            mup_3544,fup_3544,mup_4554,fup_4554,mup_5559,
                            fup_5559,mup_6061,fup_6061,mup_6264,fup_6264,
                            mup_6569,fup_6569,mup_7074,fup_7074,mup_75u,fup_75u),na.rm=T)/rowSums(cbind(mlf_1619,flp_1619,mlf_2021,
                                                                                               flp_2021,mlf_2224,flp_2224,
                                                                                               mlf_2529,flp_2529,mlf_3034,
                                                                                               flp_3034,mlf_3544,flp_3544,
                                                                                               mlf_4554,flp_4554,mlf_5559,
                                                                                               flp_5559,mlf_6061,flp_6061, 
                                                                                               mlf_6264,flp_6264,mlf_6569,
                                                                                               flp_6569,mlf_7074,flp_7074,
                                                                                               mlf_75u,flp_75u),na.rm=T),
         phi1 = hhld_inc1/hhld_inc,
         phi2 = hhld_inc2/hhld_inc,
         phi3 = hhld_inc3/hhld_inc,
         phi4 = hhld_inc4/hhld_inc,
         phi5 = hhld_inc5/hhld_inc,
         phi6 = hhld_inc6/hhld_inc,
         phi7 = hhld_inc7/hhld_inc,
         phi8 = hhld_inc8/hhld_inc,
         phi9 = hhld_inc9/hhld_inc,
         phi10 = hhld_inc10/hhld_inc,
         phi11 = hhld_inc11/hhld_inc,
         phi12 = hhld_inc12/hhld_inc,
         phi13 = hhld_inc13/hhld_inc,
         phi14 = hhld_inc14/hhld_inc,
         phi15 = hhld_inc15/hhld_inc,
         phi16 = hhld_inc16/hhld_inc) %>%
  select(GEOID,
         GEOID_full,
         NAME,
         pop_total,
         per_18t34,
         per_35t64,
         per_65over,
         median_age,
         per_asian,
         per_black,
         per_latino,
         per_white,
         entropy_er,
         per_child,
         per_nomove,
         per_immigrant,
         oo_hu,
         hownrt,
         rb,
         median_gross_rent,
         median_prop_value,
         vacant_rt,
         median_yr_str,
         median_hhld_inc,
         per_bm,
         hhld_pov_rt,
         unem_rt,
         gini,
         starts_with("phi")) %>%
  rename_at(vars(-GEOID,-GEOID_full,-NAME),function(x) paste0(x,"_2009"))

## at this point, the data frame is unique by GEOID ##
## this properly distinguishes between places (7 digit GEOID) and cosubs (10 digit GEOID) ##
nrow(munis.fin.cl.2009) == length(unique(munis.fin.cl.2009$GEOID))
nrow(munis.fin.cl.2009) == length(unique(munis.fin.cl.2009$GEOID_full))

sum(nchar(munis.fin.cl.2009$GEOID)==7) + sum(nchar(munis.fin.cl.2009$GEOID) == 10) == nrow(munis.fin.cl.2009)

## now, merge on land and pop density info ##

nrow(munis.fin.cl.2009) == length(unique(munis.fin.cl.2009$GEOID))
class(munis.fin.cl.2009$GEOID)
range(nchar(trim(munis.fin.cl.2009$GEOID)))

## make same GEOID fixes for places as above ##

places.popdensity.2009.rf <- places.popdensity.2009 %>%
  ## dropping CDP versions ##
  filter(GEOID %notin% c("2524960","2527060","3460900")) %>%
  mutate(GEOID = case_when(GEOID == "2500765" ~ "2500840",
                           GEOID == "2519330" ~ "2519370",
                           GEOID == "2577850" ~ "2577890",
                           GEOID == "2635560" ~ "2682453",
                           GEOID == "2071450" ~ "2028412",
                           GEOID == "4740240" ~ "4764668",
                           GEOID == "1239075" ~ "1239081",
                           TRUE ~ GEOID))

nrow(places.popdensity.2009.rf) == length(unique(places.popdensity.2009.rf$GEOID))
class(places.popdensity.2009.rf$GEOID)
range(nchar(trim(places.popdensity.2009.rf$GEOID)))

## merge 1 ## 

munis.fin.wpd1.2009.m <- stata.merge(munis.fin.cl.2009,
                                     places.popdensity.2009.rf,
                                     "GEOID")

table(munis.fin.wpd1.2009.m$merge.variable, useNA = "ifany")


munis.fin.wpd1.2009 <- munis.fin.wpd1.2009.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  rename(pop_density_2009 = pop_density,
         land_area_2009 = area)

## merge 2 ##
## now, merge on density data for county subs ##

## make some GEOID corrections as above ##

nrow(munis.fin.cl.2009) == length(unique(munis.fin.cl.2009$GEOID_full))
class(munis.fin.cl.2009$GEOID_full)
range(nchar(trim(munis.fin.cl.2009$GEOID_full)))

## need to apply same princeton fix ##

cousubs.popdensity.2009.rf.v1 <- cousubs.popdensity.2009 %>%
  filter(GEOID_full %notin% c("3402160900","3402160915"))
  
nrow(cousubs.popdensity.2009.rf.v1) == nrow(cousubs.popdensity.2009) - 2

## princeton merger ##
cousubs.popdensity.2009.rf.v2 <- cousubs.popdensity.2009 %>%
  filter(GEOID_full %in% c("3402160900","3402160915")) %>%
  mutate(GEOID_full = case_when(GEOID_full == "3402160915" ~ "3402160900",
                                TRUE ~ GEOID_full)) %>%
  group_by(GEOID_full) %>%
  summarize(Geo_FIPS = first(Geo_FIPS) ,
            Geo_GEOID = first(Geo_GEOID),
            Geo_NAME = first(Geo_NAME),
            Geo_COUNTY = first(Geo_COUNTY),
            Geo_STATE = first(Geo_STATE),
            Geo_COUSUB = first(Geo_COUSUB),
            Geo_PLACE = first(Geo_PLACE),
            total_pop = sum(total_pop,na.rm=T),
            area = sum(area,na.rm=T), .groups = "drop") %>%
  mutate(pop_density = total_pop/area)

## check ##
cousubs.popdensity.2009.rf.v2$GEOID_full


cousubs.popdensity.2009.rf <- rbind(cousubs.popdensity.2009.rf.v1,
                                    cousubs.popdensity.2009.rf.v2)

## check rows ##
nrow(cousubs.popdensity.2009.rf) == nrow(cousubs.popdensity.2009) - 1
  
cousubs.popdensity.2009.rd <- cousubs.popdensity.2009.rf %>%
  ## Franklin, MA is also in the places pop density file ##
  ## dropping here to avoid duplicates ##
  filter(GEOID_full != "2502125100") %>%
  mutate(GEOID_full = case_when(GEOID_full == "2501127025" ~ "2501127060",
                           GEOID_full == "2500901185" ~ "2500901260",
                           GEOID_full == "2502107665" ~ "2502107740",
                           GEOID_full == "2501724925" ~ "2501724960",
                           #GEOID_full == "2502178865" ~ "2502178972",
                           #GEOID_full == "2502580930" ~ "2502581005",
                           GEOID_full == "2502763270" ~ "2502763345",
                           GEOID_full == "2501352105" ~ "2501352144",
                           GEOID_full == "2500546575" ~ "2500546598",
                           GEOID_full == "5505909825" ~ "5505909800",
                           TRUE ~ GEOID_full))

nrow(cousubs.popdensity.2009.rd) == length(unique(cousubs.popdensity.2009.rd$GEOID_full))
class(cousubs.popdensity.2009.rd$GEOID_full)
range(nchar(trim(cousubs.popdensity.2009.rd$GEOID_full)))

## merge ## 
munis.fin.wpd2.2009.m <- stata.merge(munis.fin.cl.2009,
                                     cousubs.popdensity.2009.rd,
                                     "GEOID_full")

table(munis.fin.wpd2.2009.m$merge.variable, useNA = "ifany")

munis.fin.wpd2.2009 <- munis.fin.wpd2.2009.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  rename(pop_density_2009 = pop_density,
         land_area_2009 = area)

## combine all obs with density info ##

munis.fin.wpd.2009 <- rbind(munis.fin.wpd1.2009,
                            munis.fin.wpd2.2009)

## obs check ##
nrow(munis.fin.wpd.2009)
nrow(munis.fin.cl.2009)

nrow(munis.fin.wpd.2009) == length(unique(munis.fin.wpd.2009$GEOID))
nrow(munis.fin.wpd.2009) == length(unique(munis.fin.wpd.2009$GEOID_full))

## create final 2009 muni file ## 

munis.final.2009 <- munis.fin.wpd.2009 %>%
  ## drop remaining invalid CDPs ##
  filter(GEOID %notin% c("4906700","4950150","0284000")) %>%
  mutate(GEOID = case_when(GEOID == "2500765" ~ "2500840",
                           TRUE ~ as.character(GEOID)),
         GEOID_full = case_when(GEOID == "2527060" ~ "2501127060",
                                TRUE ~ GEOID_full)) %>%
  select(GEOID,
         GEOID_full,
         NAME,
         ends_with("_2009"))

## still unique? ##
nrow(munis.final.2009) == length(unique(munis.final.2009$GEOID))
nrow(munis.final.2009) == length(unique(munis.final.2009$GEOID_full))

## double check the CDPs ##
cdps.2009 <- filter(munis.final.2009, grepl("CDP", NAME))

## check the tricky munis ##
muni.names.2009 <- munis.final.2009 %>%
  select(GEOID,
         GEOID_full,
         NAME)

## check data ##
summary(munis.final.2009)


## how many are metro munis? ## 

range(nchar(munis.final.2009$GEOID))
range(nchar(pl.metro$GEOID))

length(unique(munis.final.2009$GEOID)) == nrow(munis.final.2009)
length(unique(pl.metro$GEOID)) == nrow(pl.metro)

ptm.2009 <- stata.merge(munis.final.2009,
                        pl.metro,
                        "GEOID")

table(ptm.2009$merge.variable, useNA = "ifany")

ptm.2009.keep <- ptm.2009 %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  select(GEOID) %>%
  unique()

cs.fm.2009 <- ptm.2009 %>%
  filter(merge.variable == 1) %>%
  select(-merge.variable)

range(nchar(cs.fm.2009$GEOID))
range(nchar(cs.metro$GEOID))

length(unique(cs.fm.2009$GEOID)) == nrow(cs.fm.2009)
length(unique(cs.metro$GEOID)) == nrow(cs.metro)

ctm.2009 <- stata.merge(cs.fm.2009,
                        cs.metro,
                        "GEOID")

table(ctm.2009$merge.variable, useNA = "ifany")

ctm.2009.keep <- ctm.2009 %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  select(GEOID) %>%
  unique()

mtm.2009 <- rbind(ptm.2009.keep,
                  ctm.2009.keep)

nrow(mtm.2009)
length(unique(mtm.2009$GEOID)) ==  nrow(mtm.2009)

print("ROWS PRIOR TO JOIN")
nrow(mtm.2009)
nrow(mtm.2022)

nrow(mtm.2009) == nrow(mtm.2022)

msa.munis <- mtm.2009 %>%
  inner_join(mtm.2022, "GEOID")

print("ROWS AFTER JOIN")
nrow(msa.munis)

save(msa.munis,
     file = paste(output_path,
                  "004_msa_muni_geoids.Rda",
                  sep=""))

## save data for all munis ##
save(munis.final.2009,
     file = paste(output_path,
                  "004_allmunis_2009.Rda",
                  sep=""))

######################
## now, 2009 tracts ##
######################

## combine all the data ## 
all.tracts.2009.cb <- bind_rows(state.tracts.2009)

## convert tracts to 2010 boundaries ##

## pre-merge checks ##

class(all.tracts.2009.cb$GEOID)
range(nchar(trim(all.tracts.2009.cb$GEOID)))
nrow(all.tracts.2009.cb) == length(unique(all.tracts.2009.cb$GEOID))

class(ltdb.fm$GEOID)
range(nchar(trim(ltdb.fm$GEOID)))

## merge the files ## 

all.tracts.2009.updated <- stata.merge(all.tracts.2009.cb,
                                       ltdb.fm,
                                       "GEOID")

## diagnose merge ## 
table(all.tracts.2009.updated$merge.variable)

## keep only the matches and obs with non-zero weights ##

all.tracts.2009.match <- all.tracts.2009.updated %>%
  filter(merge.variable == 3 &
         weight != 0) %>%
  select(-merge.variable,
         -ends_with("M")) %>%
  rename_with(~str_remove(., 'E')) %>%
  rename(GEOID = GOID)

## change the GEOID ##
## some 2010 tract IDs are duplicated because of  ##
## tracts that changed from 2000 to 2010; these   ##
## tracts will be condensed into one observation  ##
## via a weighted average, with weights from LTDB ##

all.tracts.2009.pr1a <- all.tracts.2009.match %>%
  select(GEOID,
         trtid00,
         trtid10, 
         state,
         starts_with("pop"),
         ro_hhlds,
         starts_with("rb"),
         hhld_pov_den,
         hhld_pov_num,
         gq,
         weight) %>%
  mutate_at(vars(pop_total:gq),
            .funs = funs(. * weight))

all.tracts.2009.pr2a <- all.tracts.2009.pr1a %>% 
  group_by(trtid10) %>%
  summarize_at(vars(pop_total:gq),
               funs(sum(., na.rm=T))) %>%
  ungroup() %>%
  rename(GEOID = trtid10) 

## now more complicated vars ##

all.tracts.2009.pr2b <- all.tracts.2009.match %>%
  select(GEOID,
         trtid00,
         trtid10,
         state,
         median_gross_rent,
         median_prop_value,
         weight) %>%
  group_by(trtid10) %>%
  summarize_at(vars(median_gross_rent:median_prop_value),
               funs(weighted.mean(., weight, na.rm=T))) %>%
  ungroup() %>%
  rename(GEOID = trtid10)


## check obs ## 
print("ROWS PRIOR TO JOIN")
nrow(all.tracts.2009.pr2a)
nrow(all.tracts.2009.pr2b)


all.tracts.2009 <- all.tracts.2009.pr2a %>%
  inner_join(all.tracts.2009.pr2b) %>%
  mutate(FIPS = substr(GEOID,1,5),
         gq_per = ifelse(pop_total > 0, gq/pop_total, NA),
         per_asian = case_when(pop_total != 0 ~ pop_nh_asian/pop_total,
                               pop_total == 0 ~ 0),
         per_black = case_when(pop_total != 0 ~ pop_nh_black/pop_total,
                               pop_total == 0 ~ 0),
         per_latino = case_when(pop_total != 0 ~ pop_h/pop_total,
                                pop_total == 0 ~ 0),
         per_white = case_when(pop_total != 0 ~ pop_nh_white/pop_total,
                               pop_total == 0 ~ 0),
         per_aian = case_when(pop_total != 0 ~ pop_nh_aian/pop_total, 
                              pop_total == 0 ~ 0),
         per_other = case_when(pop_total != 0 ~ rowSums(cbind(pop_nh_other,
                                                              pop_nh_nhpi,
                                                              pop_nh_multi),na.rm=T)/pop_total, 
                               pop_total == 0 ~ 0)) %>%
  filter(pop_total >= 500 & ro_hhlds >= 10 & gq_per <= 0.5) 

print("ROWS AFTER JOIN")
nrow(all.tracts.2009)

summary(all.tracts.2009)
length(unique(all.tracts.2009$GEOID)) == nrow(all.tracts.2009)
length(unique(all.tracts.2009$FIPS)) == nrow(all.tracts.2009)
length(unique(msa.del.2020.rd$FIPS)) == nrow(msa.del.2020.rd)

## what is the 2009 poverty rate for metro tracts? ##

metro.tracts.2009.m <- stata.merge(all.tracts.2009,
                                   msa.del.2020.rd, 
                                   "FIPS")

## check merge ##
table(metro.tracts.2009.m$merge.variable, useNA = "ifany")

## keep matches ##
metro.tracts.2009 <- metro.tracts.2009.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(hhld_pov_rt = ifelse(hhld_pov_den > 0, hhld_pov_num/hhld_pov_den, NA))

summary(metro.tracts.2009)


## correct GEOIDs according to Census guidance ## 
## https://www2.census.gov/geo/pdfs/reference/Geography_Notes.pdf ##

## Pima County ## 
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "04019002701"] <- "04019002704"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "04019002903"] <- "04019002906"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "04019410501"] <- "04019004118"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "04019410502"] <- "04019004121"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "04019410503"] <- "04019004125"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "04019470400"] <- "04019005200"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "04019470500"] <- "04019005300"

## LA County ##
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "06037930401"] <- "06037137000"

## NY ## 
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "36053940101"] <- "36053030101"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "36053940102"] <- "36053030102"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "36053940103"] <- "36053030103"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "36053940200"] <- "36053030200"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "36053940300"] <- "36053030300"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "36053940401"] <- "36053030401"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "36053940403"] <- "36053030403"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "36053940600"] <- "36053030600"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "36053940700"] <- "36053030402"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "36065940000"] <- "36065024800"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "36065940100"] <- "36065024700"
metro.tracts.2009$GEOID[metro.tracts.2009$GEOID == "36065940200"] <- "36065024900"

###############################
## now, process outcome data ##
###############################

## above average poverty tracts ##

npov.tracts.2009.temp <- metro.tracts.2009 %>%
  mutate(rb = case_when(rb_den !=0 ~ rowSums(cbind(rb_num1,
                                                   rb_num2,
                                                   rb_num3,
                                                   rb_num4),na.rm=T)/rb_den,
                        rb_den == 0 ~ NA),
         ## adjust $ vars for inflation (All Items CPI-U-RS Annual Averages) ##
         mgr_ia = median_gross_rent * 437.6/316.7,
         mpv_ia = median_prop_value * 437.6/316.7) %>%
  select(GEOID,
         pop_total,
         ro_hhlds,
         rb_num1,
         rb_num2,
         rb_num3,
         rb_num4,
         rb_den,
         rb,
         hhld_pov_rt,
         mgr_ia,
         mpv_ia) 

print("ROWS PRIOR TO JOIN")
nrow(npov.tracts.2009.temp)
nrow(npov.tracts.2022)

npov.tracts.2009.fin <- npov.tracts.2009.temp %>%
  inner_join(select(npov.tracts.2022, "GEOID"), "GEOID")

print("ROWS AFTER JOIN")
nrow(npov.tracts.2009.fin)

npov.tracts.2009.alt <- npov.tracts.2009.temp %>%
  filter(hhld_pov_rt >= 0.13)

## num of 2009 tracts dropped/gained ## 
nrow(npov.tracts.2009.alt) - nrow(npov.tracts.2009.fin)
(nrow(npov.tracts.2009.alt) - nrow(npov.tracts.2009.fin))/nrow(npov.tracts.2009.fin)

## now, keep only 2022 tracts for which 2009 tract data exists ## 

print("ROWS PRIOR TO JOIN")
nrow(npov.tracts.2022)
nrow(npov.tracts.2009.fin)

npov.tracts.2022.fin <- npov.tracts.2022 %>%
  inner_join(select(npov.tracts.2009.fin, "GEOID"), "GEOID")

print("ROWS AFTER JOIN")
nrow(npov.tracts.2022.fin)

npov.tracts.2022.alt <- npov.tracts.2022

## num of 2022 tracts dropped/gained ## 
nrow(npov.tracts.2022.alt) - nrow(npov.tracts.2022.fin)
(nrow(npov.tracts.2022.alt) - nrow(npov.tracts.2022.fin))/nrow(npov.tracts.2022.alt)

## get high poverty tracts ##

hpov.tracts.2009.temp <- metro.tracts.2009 %>%
  mutate(rb = case_when(rb_den !=0 ~ rowSums(cbind(rb_num1,
                                                   rb_num2,
                                                   rb_num3,
                                                   rb_num4),na.rm=T)/rb_den,
                        rb_den == 0 ~ NA),
         hhld_pov_rt = ifelse(hhld_pov_den > 0, hhld_pov_num/hhld_pov_den, NA),
         ## adjust $ vars for inflation (All Items CPI-U-RS Annual Averages) ##
         mgr_ia = median_gross_rent * 437.6/316.7,
         mpv_ia = median_prop_value * 437.6/316.7) %>%
  select(GEOID,
         pop_total,
         ro_hhlds,
         rb_num1,
         rb_num2,
         rb_num3,
         rb_num4,
         rb_den,
         rb,
         mgr_ia,
         mpv_ia,
         hhld_pov_rt)

print("ROWS PRIOR TO JOIN")
nrow(hpov.tracts.2009.temp)
nrow(hpov.tracts.2022)

hpov.tracts.2009.fin <- hpov.tracts.2009.temp %>%
  inner_join(select(hpov.tracts.2022, "GEOID"), "GEOID")

print("ROWS AFTER JOIN")
nrow(hpov.tracts.2009.fin)

hpov.tracts.2009.alt <- hpov.tracts.2009.temp %>%
  filter(hhld_pov_rt >= 0.4)

## num of 2009 tracts dropped ## 
nrow(hpov.tracts.2009.alt) - nrow(hpov.tracts.2009.fin)
(nrow(hpov.tracts.2009.alt) - nrow(hpov.tracts.2009.fin))/nrow(hpov.tracts.2009.alt)

## now, keep only 2022 tracts for which 2009 tract data exists ## 

print("ROWS PRIOR TO JOIN")
nrow(hpov.tracts.2022)
nrow(hpov.tracts.2009.fin)

hpov.tracts.2022.fin <- hpov.tracts.2022 %>%
  inner_join(select(hpov.tracts.2009.fin, "GEOID"), "GEOID")

print("ROWS AFTER JOIN")
nrow(hpov.tracts.2022.fin)

hpov.tracts.2022.alt <- hpov.tracts.2022

## num of 2022 tracts dropped/gained ## 
nrow(hpov.tracts.2022.alt) - nrow(hpov.tracts.2022.fin)
(nrow(hpov.tracts.2022.alt) - nrow(hpov.tracts.2022.fin))/nrow(hpov.tracts.2022.alt)


## save tract level files ##

save(npov.tracts.2022.fin,
     file = paste0(output_path,
                   "004_npov_tracts_2022.Rda"))

save(hpov.tracts.2022.fin,
     file = paste0(output_path,
                   "004_hpov_tracts_2022.Rda"))

save(npov.tracts.2009.fin,
     file = paste0(output_path,
                   "004_npov_tracts_2009.Rda"))

save(hpov.tracts.2009.fin,
     file = paste0(output_path,
                   "004_hpov_tracts_2009.Rda"))

## alt definitions ##

save(npov.tracts.2022.alt,
     file = paste0(output_path,
                   "004_npov_tracts_alt_2022.Rda"))

save(hpov.tracts.2022.alt,
     file = paste0(output_path,
                   "004_hpov_tracts_alt_2022.Rda"))

save(npov.tracts.2009.alt,
     file = paste0(output_path,
                   "004_npov_tracts_alt_2009.Rda"))

save(hpov.tracts.2009.alt,
     file = paste0(output_path,
                   "004_hpov_tracts_alt_2009.Rda"))

#######################################################################
## finally, assign above-average poverty tracts to places and cosubs ##
#######################################################################

## MTM 01/07/2025: verified that median gross rent and median prop value calculations are correct ##

##########
## 2009 ##
##########

tpm <- tract.to.pl.metro %>%
  filter(GEOID_muni %in% mtm.2022$GEOID)

length(unique(tpm$GEOID_muni))
length(unique(tpm$GEOID_muni)) == nrow(ptm.2009.keep)
length(unique(tpm$GEOID_muni)) == nrow(ptm.2022.keep)

tcm <- tract.to.cs.metro %>%
  filter(GEOID_muni %in% mtm.2022$GEOID & 
         GEOID_muni %notin% tpm$GEOID_muni)

length(unique(tcm$GEOID_muni))
length(unique(tcm$GEOID_muni_full)) 

length(unique(tcm$GEOID_muni)) == nrow(ctm.2009.keep)
length(unique(tcm$GEOID_muni_full)) == nrow(ctm.2022.keep)

## places first ##

npov.mpl.2009.m <- stata.merge(npov.tracts.2009.fin,
                               tpm, 
                               "GEOID")

## check merge ## 
table(npov.mpl.2009.m$merge.variable, useNA = "ifany")

npov.mpl.2009 <- npov.mpl.2009.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(rnt_wt = ro_hhlds * afact_num,
         type = "place") %>%
  rename(muni_fips = placefp,
         muni_name = placenm) %>%
  select(GEOID,
         GEOID_muni,
         cbsa10,
         type,
         pop_total,
         ro_hhlds,
         starts_with("rb"),
         mgr_ia,
         mpv_ia,
         FIPS,
         afact,
         muni_fips,
         muni_name,
         afact_num,
         rnt_wt)

## now cousubs ##

npov.tracts.2009.rm <- npov.mpl.2009.m %>%
  filter(merge.variable == 1) %>%
  select(GEOID,
         pop_total,
         ro_hhlds,
         starts_with("rb"),
         hhld_pov_rt,
         mgr_ia,
         mpv_ia)

npov.mcs.2009.m <- stata.merge(npov.tracts.2009.rm,
                               tcm, 
                               "GEOID")

## check merge ## 
table(npov.mcs.2009.m$merge.variable, useNA = "ifany")

npov.mcs.2009 <- npov.mcs.2009.m %>%
  filter(merge.variable == 3 & 
         GEOID_muni %notin% npov.mpl.2009$GEOID_muni) %>%
  select(-merge.variable) %>%
  mutate(rnt_wt = ro_hhlds * afact_num,
         type = "cousub") %>%
  rename(muni_fips = cousubfp,
         muni_name = cousubnm) %>%
  select(GEOID,
         GEOID_muni,
         cbsa10,
         type,
         pop_total,
         ro_hhlds,
         starts_with("rb"),
         mgr_ia,
         mpv_ia,
         FIPS,
         afact,
         muni_fips,
         muni_name,
         afact_num,
         rnt_wt)

## stack ##

npov.trts.all.2009 <- rbind(npov.mpl.2009,
                            npov.mcs.2009) %>%
  mutate(GEOID_muni_full = case_when(type == "place" ~ paste0(substr(FIPS,1,2),muni_fips),
                                     type == "cousub" ~ paste0(FIPS,muni_fips)))

length(unique(npov.trts.all.2009$GEOID)) == length(unique(npov.tracts.2009.fin$GEOID))


npov.2009.miss <- npov.tracts.2009.fin %>%
  filter(GEOID %notin% npov.trts.all.2009$GEOID)

npov.2009.miss.pl1 <- tpm %>%
  filter(GEOID %in% npov.2009.miss$GEOID)

npov.2009.miss.pl <- npov.2009.miss.pl1 %>%
  mutate(GEOID = paste0(substr(FIPS,1,2),
                        placefp),
         GEOID_full = paste0(FIPS,
                             placefp)) %>%
  rename(name = placenm) %>%
  select(GEOID,
         GEOID_full,
         name)

npov.2009.miss.cs <- tcm %>%
  filter(GEOID %in% npov.2009.miss$GEOID & 
         GEOID %notin% npov.2009.miss.pl1$GEOID) %>%
  mutate(GEOID = paste0(substr(FIPS,1,2),
                        cousubfp),
         GEOID_full = paste0(FIPS,
                             cousubfp)) %>%
  rename(name = cousubnm) %>%
  select(GEOID,
         GEOID_full,
         name)

npov.2009.miss.muni <- rbind(npov.2009.miss.pl,
                             npov.2009.miss.cs) %>%
  unique()

npov.2009.miss.check <- npov.2009.miss.muni %>%
  filter(!grepl("CDP|CCD|district|District", name) & 
         GEOID %notin% munis.gov.nm2$GEOID & 
         !(grepl("Township|township", name) & substr(GEOID,1,2) == "05") & 
         !(grepl("Township|township", name) & substr(GEOID,1,2) == "37"))


npov.trts.all.2009$GEOID_muni1 <- paste0(npov.trts.all.2009$FIPS,
                                         npov.trts.all.2009$muni_fips)

npov.trts.all.2009$GEOID_muni2 <- paste0(substr(npov.trts.all.2009$FIPS,1,2),
                                                npov.trts.all.2009$muni_fips)

summary(npov.trts.all.2009)
range(nchar(trim(npov.trts.all.2009$muni_fips)))
range(nchar(trim(npov.trts.all.2009$GEOID_muni_full)))
length(unique(npov.trts.all.2009$GEOID)) == nrow(npov.trts.all.2009)
length(unique(npov.trts.all.2009$muni_fips)) == nrow(npov.trts.all.2009)
length(unique(npov.trts.all.2009$GEOID_muni_full)) == nrow(npov.trts.all.2009)

save(npov.trts.all.2009,
     file = paste0(output_path,
                   "004_npov_tracts_mm_2009.Rda"))

## aggregate ## 

npov.muni.2009 <- npov.trts.all.2009 %>%
  left_join(all.tracts.2000.pr, "GEOID") %>%
  group_by(GEOID_muni_full) %>%
  mutate(pov_rnt_pop = sum(rnt_wt,na.rm=T),
         wt =  ifelse(pov_rnt_pop > 0, rnt_wt/pov_rnt_pop, NA)) %>%
  select(starts_with("GEOID"),
         rb,
         mgr_ia,
         mpv_ia,
         mgr_2000,
         mpv_2000,
         wt) %>%
  summarize_at(vars(rb:mpv_2000),
               funs(weighted.mean(., wt, na.rm=T))) %>%
  ungroup() %>%
  rename(GEOID_full = GEOID_muni_full,
         mgr = mgr_ia,
         mpv = mpv_ia)

summary(npov.muni.2009)
length(unique(npov.muni.2009$GEOID_full)) == nrow(npov.muni.2009)

save(npov.muni.2009,
     file = paste0(output_path,
                   "004_npov_muni_2009.Rda"))

## assign high-poverty tracts to places and cosubs ##

## places first ##

hpov.mpl.2009.m <- stata.merge(hpov.tracts.2009.fin,
                               tpm, 
                               "GEOID")

## check merge ## 
table(hpov.mpl.2009.m$merge.variable, useNA = "ifany")

hpov.mpl.2009 <- hpov.mpl.2009.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(rnt_wt = ro_hhlds * afact_num,
         type = "place") %>%
  rename(muni_fips = placefp,
         muni_name = placenm) %>%
  select(GEOID,
         GEOID_muni,
         cbsa10,
         type,
         pop_total,
         ro_hhlds,
         starts_with("rb"),
         mgr_ia,
         mpv_ia,
         FIPS,
         afact,
         muni_fips,
         muni_name,
         afact_num,
         rnt_wt)

## now cousubs ##

hpov.tracts.2009.rm <- hpov.mpl.2009.m %>%
  filter(merge.variable == 1) %>%
  select(GEOID,
         pop_total,
         ro_hhlds,
         starts_with("rb"),
         hhld_pov_rt,
         mgr_ia,
         mpv_ia)

hpov.mcs.2009.m <- stata.merge(hpov.tracts.2009.rm,
                               tcm, 
                               "GEOID")

## check merge ## 
table(hpov.mcs.2009.m$merge.variable, useNA = "ifany")

hpov.mcs.2009 <- hpov.mcs.2009.m %>%
  filter(merge.variable == 3 & 
           GEOID_muni %notin% hpov.mpl.2009$GEOID_muni) %>%
  select(-merge.variable) %>%
  mutate(rnt_wt = ro_hhlds * afact_num,
         type = "cousub") %>%
  rename(muni_fips = cousubfp,
         muni_name = cousubnm) %>%
  select(GEOID,
         GEOID_muni,
         cbsa10,
         type,
         pop_total,
         ro_hhlds,
         starts_with("rb"),
         mgr_ia,
         mpv_ia,
         FIPS,
         afact,
         muni_fips,
         muni_name,
         afact_num,
         rnt_wt)

## stack ##

hpov.trts.all.2009 <- rbind(hpov.mpl.2009,
                            hpov.mcs.2009) %>%
  mutate(GEOID_muni_full = case_when(type == "place" ~ paste0(substr(FIPS,1,2),muni_fips),
                                     type == "cousub" ~ paste0(FIPS,muni_fips)))

length(unique(hpov.trts.all.2009$GEOID)) == length(unique(hpov.tracts.2009.fin$GEOID))


hpov.2009.miss <- hpov.tracts.2009.fin %>%
  filter(GEOID %notin% hpov.trts.all.2009$GEOID)

hpov.2009.miss.pl1 <- tpm %>%
  filter(GEOID %in% hpov.2009.miss$GEOID)

hpov.2009.miss.pl <- hpov.2009.miss.pl1 %>%
  mutate(GEOID = paste0(substr(FIPS,1,2),
                        placefp),
         GEOID_full = paste0(FIPS,
                             placefp)) %>%
  rename(name = placenm) %>%
  select(GEOID,
         GEOID_full,
         name)

hpov.2009.miss.cs <- tcm %>%
  filter(GEOID %in% hpov.2009.miss$GEOID & 
           GEOID %notin% hpov.2009.miss.pl1$GEOID) %>%
  mutate(GEOID = paste0(substr(FIPS,1,2),
                        cousubfp),
         GEOID_full = paste0(FIPS,
                             cousubfp)) %>%
  rename(name = cousubnm) %>%
  select(GEOID,
         GEOID_full,
         name)

hpov.2009.miss.muni <- rbind(hpov.2009.miss.pl,
                             hpov.2009.miss.cs) %>%
  unique()

hpov.2009.miss.check <- hpov.2009.miss.muni %>%
  filter(!grepl("CDP|CCD|district|District", name) & 
           GEOID %notin% munis.gov.nm2$GEOID & 
           !(grepl("Township|township", name) & substr(GEOID,1,2) == "05") & 
           !(grepl("Township|township", name) & substr(GEOID,1,2) == "37"))


hpov.trts.all.2009$GEOID_muni1 <- paste0(hpov.trts.all.2009$FIPS,
                                         hpov.trts.all.2009$muni_fips)

hpov.trts.all.2009$GEOID_muni2 <- paste0(substr(hpov.trts.all.2009$FIPS,1,2),
                                         hpov.trts.all.2009$muni_fips)

summary(hpov.trts.all.2009)
range(nchar(trim(hpov.trts.all.2009$muni_fips)))
range(nchar(trim(hpov.trts.all.2009$GEOID_muni_full)))
length(unique(hpov.trts.all.2009$GEOID)) == nrow(hpov.trts.all.2009)
length(unique(hpov.trts.all.2009$muni_fips)) == nrow(hpov.trts.all.2009)
length(unique(hpov.trts.all.2009$GEOID_muni_full)) == nrow(hpov.trts.all.2009)

save(hpov.trts.all.2009,
     file = paste0(output_path,
                   "004_hpov_tracts_mm_2009.Rda"))



## aggregate ## 

hpov.muni.2009 <- hpov.trts.all.2009 %>%
  left_join(all.tracts.2000.pr, "GEOID") %>%
  group_by(GEOID_muni_full) %>%
  mutate(pov_rnt_pop = sum(rnt_wt,na.rm=T),
         wt =  ifelse(pov_rnt_pop > 0, rnt_wt/pov_rnt_pop, NA)) %>%
  select(starts_with("GEOID"),
         rb,
         mgr_ia,
         mpv_ia,
         mgr_2000,
         mpv_2000,
         wt) %>%
  summarize_at(vars(rb:mpv_2000),
               funs(weighted.mean(., wt, na.rm=T))) %>%
  ungroup() %>%
  rename(GEOID_full = GEOID_muni_full,
         mgr = mgr_ia,
         mpv = mpv_ia)

summary(hpov.muni.2009)
length(unique(hpov.muni.2009$GEOID_full)) == nrow(hpov.muni.2009)

save(hpov.muni.2009,
     file = paste0(output_path,
                   "004_hpov_muni_2009.Rda"))

###############
## now, 2022 ##
###############

## assign above-average poverty tracts to places and cosubs ##

## places first ##

npov.mpl.2022.m <- stata.merge(npov.tracts.2022.fin,
                               tpm, 
                               "GEOID")

## check merge ## 
table(npov.mpl.2022.m$merge.variable, useNA = "ifany")

npov.mpl.2022 <- npov.mpl.2022.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(rnt_wt = ro_hhlds * afact_num,
         type = "place") %>%
  rename(muni_fips = placefp,
         muni_name = placenm) %>%
  select(GEOID,
         GEOID_muni,
         cbsa10,
         type,
         pop_total,
         ro_hhlds,
         starts_with("rb"),
         median_gross_rent,
         median_prop_value,
         FIPS,
         afact,
         muni_fips,
         muni_name,
         afact_num,
         rnt_wt)

## now cousubs ##

npov.tracts.2022.rm <- npov.mpl.2022.m %>%
  filter(merge.variable == 1) %>%
  select(GEOID,
         pop_total,
         ro_hhlds,
         starts_with("rb"),
         hhld_pov_rt,
         median_gross_rent,
         median_prop_value)

npov.mcs.2022.m <- stata.merge(npov.tracts.2022.rm,
                               tcm, 
                               "GEOID")

## check merge ## 
table(npov.mcs.2022.m$merge.variable, useNA = "ifany")

npov.mcs.2022 <- npov.mcs.2022.m %>%
  filter(merge.variable == 3 & 
         GEOID_muni %notin% npov.mpl.2022$GEOID_muni) %>%
  select(-merge.variable) %>%
  mutate(rnt_wt = ro_hhlds * afact_num,
         type = "cousub") %>%
  rename(muni_fips = cousubfp,
         muni_name = cousubnm) %>%
  select(GEOID,
         GEOID_muni,
         cbsa10,
         type,
         pop_total,
         ro_hhlds,
         starts_with("rb"),
         median_gross_rent,
         median_prop_value,
         FIPS,
         afact,
         muni_fips,
         muni_name,
         afact_num,
         rnt_wt)

## stack ##

npov.trts.all.2022 <- rbind(npov.mpl.2022,
                            npov.mcs.2022) %>%
  mutate(GEOID_muni_full = case_when(type == "place" ~ paste0(substr(FIPS,1,2),muni_fips),
                                     type == "cousub" ~ paste0(FIPS,muni_fips)))

length(unique(npov.trts.all.2022$GEOID)) == length(unique(npov.tracts.2022.fin$GEOID))


npov.2022.miss <- npov.tracts.2022.fin %>%
  filter(GEOID %notin% npov.trts.all.2022$GEOID)

npov.2022.miss.pl1 <- tpm %>%
  filter(GEOID %in% npov.2022.miss$GEOID)

npov.2022.miss.pl <- npov.2022.miss.pl1 %>%
  mutate(GEOID = paste0(substr(FIPS,1,2),
                        placefp),
         GEOID_full = paste0(FIPS,
                             placefp)) %>%
  rename(name = placenm) %>%
  select(GEOID,
         GEOID_full,
         name)

npov.2022.miss.cs <- tcm %>%
  filter(GEOID %in% npov.2022.miss$GEOID & 
           GEOID %notin% npov.2022.miss.pl1$GEOID) %>%
  mutate(GEOID = paste0(substr(FIPS,1,2),
                        cousubfp),
         GEOID_full = paste0(FIPS,
                             cousubfp)) %>%
  rename(name = cousubnm) %>%
  select(GEOID,
         GEOID_full,
         name)

npov.2022.miss.muni <- rbind(npov.2022.miss.pl,
                             npov.2022.miss.cs) %>%
  unique()

npov.2022.miss.check <- npov.2022.miss.muni %>%
  filter(!grepl("CDP|CCD|district|District", name) & 
           GEOID %notin% munis.gov.nm2$GEOID & 
           !(grepl("Township|township", name) & substr(GEOID,1,2) == "05") & 
           !(grepl("Township|township", name) & substr(GEOID,1,2) == "37"))


npov.trts.all.2022$GEOID_muni1 <- paste0(npov.trts.all.2022$FIPS,
                                         npov.trts.all.2022$muni_fips)

npov.trts.all.2022$GEOID_muni2 <- paste0(substr(npov.trts.all.2022$FIPS,1,2),
                                         npov.trts.all.2022$muni_fips)

summary(npov.trts.all.2022)
range(nchar(trim(npov.trts.all.2022$muni_fips)))
range(nchar(trim(npov.trts.all.2022$GEOID_muni_full)))
length(unique(npov.trts.all.2022$GEOID)) == nrow(npov.trts.all.2022)
length(unique(npov.trts.all.2022$muni_fips)) == nrow(npov.trts.all.2022)
length(unique(npov.trts.all.2022$GEOID_muni_full)) == nrow(npov.trts.all.2022)

save(npov.trts.all.2022,
     file = paste0(output_path,
                   "004_npov_tracts_mm_2022.Rda"))



## aggregate ## 

npov.muni.2022 <- npov.trts.all.2022 %>%
  left_join(all.tracts.2012, "GEOID") %>%
  group_by(GEOID_muni_full) %>%
  mutate(pov_rnt_pop = sum(rnt_wt,na.rm=T),
         wt =  ifelse(pov_rnt_pop > 0, rnt_wt/pov_rnt_pop, NA)) %>%
  select(starts_with("GEOID"),
         rb,
         median_gross_rent,
         median_prop_value,
         mgr_2012,
         mpv_2012,
         wt) %>%
  summarize_at(vars(rb:mpv_2012),
               funs(weighted.mean(., wt, na.rm=T))) %>%
  ungroup() %>%
  rename(GEOID_full = GEOID_muni_full,
         mgr = median_gross_rent,
         mpv = median_prop_value)

summary(npov.muni.2022)
length(unique(npov.muni.2022$GEOID_full)) == nrow(npov.muni.2022)

save(npov.muni.2022,
     file = paste0(output_path,
                   "004_npov_muni_2022.Rda"))

## assign high-poverty tracts to places and cosubs ##

## places first ##

hpov.mpl.2022.m <- stata.merge(hpov.tracts.2022.fin,
                               tpm, 
                               "GEOID")

## check merge ## 
table(hpov.mpl.2022.m$merge.variable, useNA = "ifany")

hpov.mpl.2022 <- hpov.mpl.2022.m %>%
  filter(merge.variable == 3) %>%
  select(-merge.variable) %>%
  mutate(rnt_wt = ro_hhlds * afact_num,
         type = "place") %>%
  rename(muni_fips = placefp,
         muni_name = placenm) %>%
  select(GEOID,
         GEOID_muni,
         cbsa10,
         type,
         pop_total,
         ro_hhlds,
         starts_with("rb"),
         median_gross_rent,
         median_prop_value,
         FIPS,
         afact,
         muni_fips,
         muni_name,
         afact_num,
         rnt_wt)

## now cousubs ##

hpov.tracts.2022.rm <- hpov.mpl.2022.m %>%
  filter(merge.variable == 1) %>%
  select(GEOID,
         pop_total,
         ro_hhlds,
         starts_with("rb"),
         hhld_pov_rt,
         median_gross_rent,
         median_prop_value)

hpov.mcs.2022.m <- stata.merge(hpov.tracts.2022.rm,
                               tcm, 
                               "GEOID")

## check merge ## 
table(hpov.mcs.2022.m$merge.variable, useNA = "ifany")

hpov.mcs.2022 <- hpov.mcs.2022.m %>%
  filter(merge.variable == 3 & 
         GEOID_muni %notin% hpov.mpl.2022$GEOID_muni) %>%
  select(-merge.variable) %>%
  mutate(rnt_wt = ro_hhlds * afact_num,
         type = "cousub") %>%
  rename(muni_fips = cousubfp,
         muni_name = cousubnm) %>%
  select(GEOID,
         GEOID_muni,
         cbsa10,
         type,
         pop_total,
         ro_hhlds,
         starts_with("rb"),
         median_gross_rent,
         median_prop_value,
         FIPS,
         afact,
         muni_fips,
         muni_name,
         afact_num,
         rnt_wt)

## stack ##

hpov.trts.all.2022 <- rbind(hpov.mpl.2022,
                            hpov.mcs.2022) %>%
  mutate(GEOID_muni_full = case_when(type == "place" ~ paste0(substr(FIPS,1,2),muni_fips),
                                     type == "cousub" ~ paste0(FIPS,muni_fips)))

length(unique(hpov.trts.all.2022$GEOID)) == length(unique(hpov.tracts.2022.fin$GEOID))


hpov.2022.miss <- hpov.tracts.2022.fin %>%
  filter(GEOID %notin% hpov.trts.all.2022$GEOID)

hpov.2022.miss.pl1 <- tpm %>%
  filter(GEOID %in% hpov.2022.miss$GEOID)

hpov.2022.miss.pl <- hpov.2022.miss.pl1 %>%
  mutate(GEOID = paste0(substr(FIPS,1,2),
                        placefp),
         GEOID_full = paste0(FIPS,
                             placefp)) %>%
  rename(name = placenm) %>%
  select(GEOID,
         GEOID_full,
         name)

hpov.2022.miss.cs <- tcm %>%
  filter(GEOID %in% hpov.2022.miss$GEOID & 
           GEOID %notin% hpov.2022.miss.pl1$GEOID) %>%
  mutate(GEOID = paste0(substr(FIPS,1,2),
                        cousubfp),
         GEOID_full = paste0(FIPS,
                             cousubfp)) %>%
  rename(name = cousubnm) %>%
  select(GEOID,
         GEOID_full,
         name)

hpov.2022.miss.muni <- rbind(hpov.2022.miss.pl,
                             hpov.2022.miss.cs) %>%
  unique()

hpov.2022.miss.check <- hpov.2022.miss.muni %>%
  filter(!grepl("CDP|CCD|district|District", name) & 
           GEOID %notin% munis.gov.nm2$GEOID & 
           !(grepl("Township|township", name) & substr(GEOID,1,2) == "05") & 
           !(grepl("Township|township", name) & substr(GEOID,1,2) == "37"))


hpov.trts.all.2022$GEOID_muni1 <- paste0(hpov.trts.all.2022$FIPS,
                                         hpov.trts.all.2022$muni_fips)

hpov.trts.all.2022$GEOID_muni2 <- paste0(substr(hpov.trts.all.2022$FIPS,1,2),
                                         hpov.trts.all.2022$muni_fips)

summary(hpov.trts.all.2022)
range(nchar(trim(hpov.trts.all.2022$muni_fips)))
range(nchar(trim(hpov.trts.all.2022$GEOID_muni_full)))
length(unique(hpov.trts.all.2022$GEOID)) == nrow(hpov.trts.all.2022)
length(unique(hpov.trts.all.2022$muni_fips)) == nrow(hpov.trts.all.2022)
length(unique(hpov.trts.all.2022$GEOID_muni_full)) == nrow(hpov.trts.all.2022)

save(hpov.trts.all.2022,
     file = paste0(output_path,
                   "004_hpov_tracts_mm_2022.Rda"))


## aggregate ## 

hpov.muni.2022 <- hpov.trts.all.2022 %>%
  left_join(all.tracts.2012, "GEOID") %>%
  group_by(GEOID_muni_full) %>%
  mutate(pov_rnt_pop = sum(rnt_wt,na.rm=T),
         wt =  ifelse(pov_rnt_pop > 0, rnt_wt/pov_rnt_pop, NA)) %>%
  select(starts_with("GEOID"),
         rb,
         median_gross_rent,
         median_prop_value,
         mgr_2012,
         mpv_2012,
         wt) %>%
  summarize_at(vars(rb:mpv_2012),
               funs(weighted.mean(., wt, na.rm=T))) %>%
  ungroup() %>%
  rename(GEOID_full = GEOID_muni_full,
         mgr = median_gross_rent,
         mpv = median_prop_value)

summary(hpov.muni.2022)
length(unique(hpov.muni.2022$GEOID_full)) == nrow(hpov.muni.2022)

save(hpov.muni.2022,
     file = paste0(output_path,
                   "004_hpov_muni_2022.Rda"))

## END OF PROGRAM ##

#sink()
